# Table of contents -------------------------------------------------------
# 1. vdemdata CHECK
# 2. Rainbowmap CHECK
# 3. Economist Democracy Scores for 2018 CHECK
# 4. Happiness CHECK
# 5. GDP per capita CHECK
# 6. ESS round 9 CHECK
# 7. Unemployment rate CHECK
# 8. Gender equality index CHECK
## further ideas:
# which political parties governed in the years before (i.e., left, centre-right etc.)
# Gini index
# migrant acceptance scores
# welfare state/ size of government relative to GDP
# physical/ mental health issues; quality and equity of healthcare
# solidarity with minority groups
# education
# load libraries ----------------------------------------------------------
# devtools::install_github("vdeminstitute/vdemdata")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(readxl)
library(vdemdata)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loading required package: survival
##
## Attaching package: 'survey'
##
## The following object is masked from 'package:graphics':
##
## dotchart
library(countrycode)
library(rvest)
##
## Attaching package: 'rvest'
##
## The following object is masked from 'package:readr':
##
## guess_encoding
library(stringr)
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Loading required package: colorspace
## VIM is ready to use.
##
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
##
## The following object is masked from 'package:datasets':
##
## sleep
library(visdat)
library(naniar)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:survival':
##
## cluster
##
## The following object is masked from 'package:purrr':
##
## lift
# mapping -----------------------------------------------------------------
# we comment out the country codes as we likely don't need them
survey_country_mapping <- data.frame(
country_name = c("France", "Belgium", "Netherlands", "Germany", "Italy",
"Luxembourg", "Denmark", "Ireland", "United Kingdom", "Greece",
"Spain", "Portugal", "Finland", "Sweden",
"Austria", "Cyprus", "Czech Republic", "Estonia", "Hungary",
"Latvia", "Lithuania", "Malta", "Poland", "Slovakia",
"Slovenia", "Bulgaria", "Romania", "Croatia"),
#survey_country_code = c(1, 2, 3, 4, 5,
# 6, 7, 8, 9, 11,
# 12, 13, 16, 17,
# 18, 19, 20, 21, 22,
# 23, 24, 25, 26, 27,
# 28, 29, 30, 32),
iso2 = c("FR", "BE", "NL", "DE", "IT",
"LU", "DK", "IE", "GB", "GR",
"ES", "PT", "FI", "SE",
"AT", "CY", "CZ", "EE", "HU",
"LV", "LT", "MT", "PL", "SK",
"SI", "BG", "RO", "HR"),
stringsAsFactors = FALSE)
# create a mapping for country name standardization: the Czech Republic is a problem
country_name_mapping <- c("Czechia" = "Czech Republic")
# 1. vdemdata ----------------------------------------------------------------
# install the package from GitHub first:
# devtools::install_github("vdeminstitute/vdemdata")
vdem_data <- vdem
# apply country name standardization before filtering
vdem_data_standardized <- vdem_data %>%
mutate(
# Standardize country names
country_name = ifelse(country_name %in% names(country_name_mapping),
country_name_mapping[country_name],
country_name))
# filter for most recent data (2019 to match survey data)
vdem_2019 <- vdem_data_standardized %>%
filter(country_name %in% survey_country_mapping$country_name, year == 2019) %>%
select(country_name, v2x_libdem, v2x_polyarchy, v2x_gender,
v2x_egaldem, v2x_liberal, v2xcs_ccsi, v2x_freexp) # select relevant variables
saveRDS(vdem_2019, file = "vdem_eu_2019.rds")
write.csv(vdem_2019, "vdem_eu_2019.csv")
## from the codebook:
# v2x_libdem: index of liberal democracy
# v2x_polyarchy: index of electoral democracy
# v2x_gender: index of women's political empowerment
# v2x_egaldem: index of egalitarian democracy
# v2x_liberal: index of civil liberties
# v2xcs_ccsi: index of civil society participation
# v2x_freexp: index of freedom of expression
# 2. Rainbowmap --------------------------------------------------------------
# https://rainbowmap.ilga-europe.org/
# rainbow_data <- read_csv("data/raw/2024-rainbow-map-data.csv")
# but the problem: that's for 2024, not 2019 or before 2019
# hence, we would need to get the data for 2019 and years before:
# create data frames for each year
df_2019 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Norway", "France", "Finland",
"Denmark", "Spain", "Portugal", "Sweden", "Netherlands", "Austria",
"Germany", "Croatia", "Greece", "Ireland", "Hungary", "Luxembourg",
"Iceland", "Slovenia", "Montenegro", "Estonia", "Switzerland", "Georgia",
"Bosnia & Herzegovina", "Slovakia", "Albania", "Serbia", "Bulgaria",
"Czechia", "Kosovo", "Andorra", "Cyprus", "Romania", "Ukraine",
"Lithuania", "Italy", "Poland", "Latvia", "Belarus", "Moldova",
"Russia", "North Macedonia", "Liechtenstein", "San Marino", "Armenia",
"Turkey", "Monaco", "Azerbaijan"),
Value = c(74.72, 70.40, 67.62, 63.64, 62.73, 62.20, 60.31, 58.49, 57.50, 52.86,
49.72, 48.54, 47.45, 45.83, 44.82, 44.72, 42.83, 41.34, 40.20, 39.82,
37.16, 34.38, 30.06, 29.49, 29.37, 29.04, 27.74, 26.43, 25.75, 25.50,
25.34, 23.93, 22.03, 20.67, 19.97, 19.91, 19.43, 18.10, 16.83, 15.82,
15.10, 11.75, 10.88, 10.08, 9.87, 8.50, 8.50, 7.31, 5.67),
Year = 2019)
df_2018 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Finland", "France", "Norway",
"Portugal", "Denmark", "Spain", "Sweden", "Netherlands", "Germany",
"Austria", "Greece", "Ireland", "Croatia", "Slovenia", "Luxembourg",
"Iceland", "Hungary", "Estonia", "Switzerland", "Montenegro", "Andorra",
"Albania", "Kosovo", "Bosnia & Herzegovina", "Serbia", "Czechia",
"Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Lithuania",
"Romania", "Ukraine", "Poland", "Liechtenstein", "Latvia",
"North Macedonia", "Belarus", "Moldova", "San Marino", "Russia",
"Monaco", "Turkey", "Armenia", "Azerbaijan"),
Value = c(91.94, 78.70, 73.48, 73.27, 72.81, 72.74, 69.16, 67.69, 67.03, 60.10,
59.64, 59.00, 56.40, 52.32, 52.22, 50.58, 47.73, 47.48, 47.22, 47.16,
39.34, 38.44, 37.74, 34.81, 33.24, 32.98, 31.38, 29.68, 29.20, 28.95,
28.65, 28.82, 25.87, 24.15, 21.78, 21.12, 20.95, 18.23, 17.87, 16.07,
14.03, 13.35, 13.08, 12.32, 10.80, 9.78, 8.60, 7.20, 4.70),
Year = 2018)
df_2017 <- data.frame(
Country = c("Malta", "Norway", "United Kingdom", "Belgium", "France", "Portugal",
"Finland", "Denmark", "Spain", "Netherlands", "Croatia", "Sweden",
"Austria", "Germany", "Ireland", "Iceland", "Greece", "Luxembourg",
"Hungary", "Slovenia", "Montenegro", "Andorra", "Estonia", "Albania",
"Bosnia & Herzegovina", "Switzerland", "Kosovo", "Serbia", "Czechia",
"Cyprus", "Slovakia", "Italy", "Georgia", "Bulgaria", "Romania",
"Ukraine", "Poland", "Liechtenstein", "Lithuania", "Latvia",
"North Macedonia", "Belarus", "Moldova", "San Marino", "Monaco",
"Turkey", "Armenia", "Azerbaijan"),
Value = c(77.74, 75.73, 71.86, 70.82, 69.16, 68.27, 67.69, 67.03, 64.44, 62.36,
60.10, 55.58, 54.41, 52.22, 47.22, 46.92, 46.48, 44.82, 44.28, 38.64,
34.81, 33.31, 33.24, 31.34, 30.94, 30.48, 29.68, 29.20, 28.95, 27.60,
26.67, 25.87, 23.15, 21.12, 19.00, 18.23, 17.87, 17.28, 17.12, 16.03,
13.35, 13.08, 12.32, 9.78, 8.60, 7.20, 6.40, 4.70),
Year = 2017)
df_2016 <- data.frame(
Country = c("Malta", "Belgium", "United Kingdom", "Spain", "Denmark", "Portugal",
"Finland", "France", "Croatia", "Netherlands", "Norway", "Sweden",
"Austria", "Iceland", "Greece", "Germany", "Ireland", "Hungary",
"Luxembourg", "Montenegro", "Estonia", "Albania", "Switzerland",
"Andorra", "Serbia", "Cyprus", "Slovenia", "Czechia", "Kosovo",
"Georgia", "Bosnia & Herzegovina", "Slovakia", "Bulgaria", "Romania",
"Italy", "Poland", "Liechtenstein", "Lithuania", "Latvia",
"North Macedonia", "San Marino", "Moldova", "Belarus", "Monaco",
"Turkey", "Armenia", "Azerbaijan"),
Value = c(77.75, 81.85, 79.19, 70.95, 70.90, 69.55, 67.25, 66.60, 66.55, 66.10,
65.15, 64.85, 62.21, 59.00, 58.30, 55.14, 54.70, 51.40, 50.35, 45.20,
38.25, 34.40, 33.15, 32.10, 32.00, 31.95, 31.65, 31.60, 31.55, 30.35,
29.40, 29.20, 24.00, 23.45, 19.75, 18.30, 18.20, 18.10, 17.35, 15.55,
14.40, 14.15, 13.35, 10.80, 8.75, 7.20, 4.85),
Year = 2016)
df_2015 <- data.frame(
Country = c("United Kingdom", "Belgium", "Malta", "Sweden", "Croatia", "Netherlands",
"Norway", "Spain", "Denmark", "Portugal", "France", "Iceland", "Finland",
"Germany", "Austria", "Hungary", "Montenegro", "Luxembourg", "Albania",
"Ireland", "Greece", "Georgia", "Czechia", "Estonia", "Slovenia",
"Andorra", "Bosnia & Herzegovina", "Serbia", "Slovakia", "Romania",
"Switzerland", "Bulgaria", "Poland", "Italy", "Liechtenstein",
"Lithuania", "Cyprus", "Kosovo", "Latvia", "Moldova", "Belarus",
"San Marino", "North Macedonia", "Turkey", "Monaco", "Armenia",
"Azerbaijan"),
Value = c(88.00, 83.00, 77.00, 72.00, 71.00, 69.00, 69.00, 69.00, 68.00, 67.00,
65.00, 63.00, 62.00, 56.00, 52.00, 50.00, 46.00, 43.00, 42.00, 40.00,
39.00, 36.00, 35.00, 34.00, 32.00, 31.00, 29.00, 29.00, 29.00, 28.00,
28.00, 27.00, 26.00, 22.00, 19.00, 19.00, 18.00, 18.00, 18.00, 16.00,
14.00, 14.00, 13.00, 12.00, 11.00, 9.00, 5.00),
Year = 2015)
df_2014 <- data.frame(
Country = c("United Kingdom", "Belgium", "Spain", "Netherlands", "Norway",
"Portugal", "Sweden", "France", "Iceland", "Denmark", "Malta",
"Croatia", "Germany", "Hungary", "Austria", "Montenegro", "Finland",
"Albania", "Slovenia", "Czechia", "Estonia", "Ireland", "Greece",
"Slovakia", "Serbia", "Bulgaria", "Switzerland", "Luxembourg",
"Romania", "Poland", "Italy", "Georgia", "Lithuania", "Andorra",
"Bosnia & Herzegovina", "Cyprus", "Latvia", "Liechtenstein", "Kosovo",
"Moldova", "Turkey", "San Marino", "Belarus", "North Macedonia",
"Ukraine", "Monaco", "Armenia", "Azerbaijan"),
Value = c(80.25, 78.10, 73.26, 69.90, 68.40, 66.60, 65.30, 64.10, 63.95, 59.90,
56.80, 56.30, 55.68, 53.65, 52.10, 47.05, 45.30, 38.40, 35.00, 34.65,
34.65, 33.65, 31.15, 30.50, 30.30, 30.00, 28.85, 28.35, 27.95, 27.65,
27.40, 28.05, 21.70, 20.60, 20.10, 19.65, 19.65, 18.00, 17.10, 16.50,
14.15, 13.70, 13.60, 13.30, 11.65, 10.10, 8.50, 6.60),
Year = 2014)
df_2013 <- data.frame(
Country = c("United Kingdom", "Belgium", "Norway", "Sweden", "Spain", "Portugal",
"France", "Netherlands", "Denmark", "Iceland", "Hungary", "Germany",
"Croatia", "Finland", "Austria", "Albania", "Malta", "Slovenia",
"Czechia", "Ireland", "Romania", "Estonia", "Switzerland", "Luxembourg",
"Greece", "Slovakia", "Montenegro", "Serbia", "Poland", "Georgia",
"Lithuania", "Andorra", "Bosnia & Herzegovina", "Cyprus", "Latvia",
"Italy", "Bulgaria", "Liechtenstein", "Turkey", "San Marino", "Belarus",
"Kosovo", "North Macedonia", "Ukraine", "Monaco", "Armenia", "Azerbaijan"),
Value = c(78.50, 68.73, 65.65, 65.30, 65.04, 64.60, 64.10, 60.00, 59.80, 55.50,
54.70, 54.29, 48.30, 47.25, 43.35, 38.40, 35.30, 35.00, 34.65, 33.65,
31.30, 28.90, 28.85, 28.35, 28.10, 28.90, 28.65, 25.05, 21.65, 21.05,
20.70, 20.60, 19.95, 19.65, 19.65, 19.40, 18.00, 15.50, 14.15, 13.70,
13.60, 13.50, 13.30, 11.65, 10.10, 7.50, 7.10),
Year = 2013)
# combine all data frames into one
df_combined <- bind_rows(df_2019, df_2018, df_2017, df_2016, df_2015, df_2014, df_2013)
# create new, compressed df
# step 1: Filter data for 2019 and 2018
df_2019 <- df_combined %>% filter(Year == 2019) %>% rename(Value_2019 = Value)
df_2018 <- df_combined %>% filter(Year == 2018) %>% rename(Value_2018 = Value)
# step 2: Filter data for 2013 and 2014 and calculate the average
df_2013_2014 <- df_combined %>% filter(Year %in% c(2013, 2014)) %>%
group_by(Country) %>%
summarise(Avg_2013_2014 = mean(Value, na.rm = TRUE))
# step 3: Join the data frames for 2019 and 2018
df_compressed <- df_2019 %>%
left_join(df_2018, by = "Country") %>%
select(Country, Value_2019, Value_2018)
# step 4: Calculate the average for 2019 and 2018
df_compressed <- df_compressed %>%
mutate(Avg_2019_2018 = (Value_2019 + Value_2018) / 2)
# step 5: Join the average for 2013 and 2014
df_compressed <- df_compressed %>%
left_join(df_2013_2014, by = "Country")
# step 6: Calculate the difference between the averages
df_compressed <- df_compressed %>%
mutate(Difference = Avg_2019_2018 - Avg_2013_2014)
# step 7: Select and reorder columns for the final compressed data frame
df_compressed <- df_compressed %>%
select(Country, Value_2019, Value_2018, Avg_2019_2018,
Avg_2013_2014, Difference)
rainbow_df <- df_compressed
# save the data frame
saveRDS(rainbow_df, file = "rainbow_df.rds")
write.csv(rainbow_df, "rainbow_df.csv")
# 3. The Economist: Democracy scores 2018 ---------------------------------
# https://enperspectiva.uy/wp-content/uploads/2019/01/Democracy_Index_2018.pdf
democracy_scores <- data.frame(
Country = c("Belgium", "Denmark", "Greece", "Spain", "Finland", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
ISO2 = c("BE", "DK", "GR", "ES", "FI", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
Overall_score = c(7.78, 9.22, 7.29, 8.08, 9.14, 7.80, 9.15, 7.71, 8.81, 8.89, 8.29, 7.84, 9.39, 8.68, 8.53, 7.03, 7.59, 7.69, 7.97, 6.63, 7.38, 7.50, 8.21, 6.67, 6.38, 7.10, 7.50, 6.57),
#Global_rank = c(31, 5, 39, 19, 8, 29, "6=", 33, 12, 11, 16, 27, 3, 13, 14, 46, 35, 34, "23=", "57=", 38, "36=", 18, "54=", "66=", 44, "36=", 60),
#Regional_rank = c(17, 4, 20, 14, 6, 16, 5, 18, 9, 8, 12, 15, 3, 10, 11, 7, 19, 2, 1, 9, 5, "3=", 13, 8, 12, 6, "3=", 10),
Electoral_process_and_pluralism = c(9.58, 10.00, 9.58, 9.17, 10.00, 9.58, 9.58, 9.58, 10.00, 9.58, 9.58, 9.58, 9.58, 9.58, 9.58, 9.17, 9.17, 9.58, 9.58, 8.75, 9.58, 9.58, 9.17, 9.17, 9.17, 9.58, 9.58, 9.17),
Functioning_of_government = c(8.93, 9.29, 5.36, 7.14, 8.93, 7.50, 7.86, 6.07, 8.93, 9.29, 7.86, 7.50, 9.64, 8.57, 7.50, 6.43, 6.43, 6.79, 8.21, 6.07, 6.07, 6.43, 8.21, 6.07, 5.71, 6.79, 6.79, 6.07),
Political_participation = c(5.00, 8.33, 6.11, 7.78, 8.33, 7.78, 8.33, 7.78, 6.67, 8.33, 8.33, 6.11, 8.33, 8.33, 8.33, 7.22, 6.67, 6.67, 6.67, 5.00, 5.56, 6.11, 6.11, 6.11, 5.00, 5.56, 6.67, 5.56),
Political_culture = c(6.88, 9.38, 6.88, 7.50, 8.75, 5.63, 10.00, 6.88, 8.75, 8.13, 6.88, 6.88, 10.00, 7.50, 8.13, 4.38, 6.88, 6.88, 6.88, 6.25, 6.88, 6.25, 8.75, 4.38, 4.38, 5.63, 6.25, 5.00),
Civil_liberties = c(8.53, 9.12, 8.53, 8.82, 9.71, 8.53, 10.00, 8.24, 9.71, 9.12, 8.82, 9.12, 9.41, 9.41, 9.12, 7.94, 8.82, 8.53, 8.53, 7.06, 8.82, 9.12, 8.82, 7.65, 7.65, 7.94, 8.24, 7.06),
Regime_type = c("Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Full democracy", "Full democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Full democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy", "Flawed democracy"))
saveRDS(democracy_scores, file = "democracy_scores.rds")
write.csv(democracy_scores, "democracy_scores.csv")
# 4. Happiness data 2018 ---------------------------------------------------------------------
# https://s3.amazonaws.com/happiness-report/2018/WHR_web.pdf
happiness_scores <- data.frame(
Country = c("Finland", "Denmark", "Greece", "Spain", "France", "Ireland", "Italy", "Luxembourg", "Netherlands", "Austria", "Portugal", "Sweden", "Germany", "United Kingdom", "Bulgaria", "Cyprus", "Czech Republic", "Estonia", "Hungary", "Latvia", "Lithuania", "Malta", "Poland", "Romania", "Slovakia", "Slovenia", "Croatia"),
ISO2 = c("FI", "DK", "GR", "ES", "FR", "IE", "IT", "LU", "NL", "AT", "PT", "SE", "DE", "GB", "BG", "CY", "CZ", "EE", "HU", "LV", "LT", "MT", "PL", "RO", "SK", "SI", "HR"),
Happiness_Score = c(7.632, 7.555, 5.358, 6.310, 6.489, 6.977, 6.000, 6.910, 7.441, 7.139, 5.410, 7.314, 6.965, 6.814, 4.933, 5.762, 6.711, 5.739, 5.620, 5.933, 5.952, 6.627, 6.123, 5.945, 6.173, 5.948, 5.321))
saveRDS(happiness_scores, file = "happiness_scores.rds")
write.csv(happiness_scores, "happiness_scores.csv")
# 5. GDP per capita ---------------------------------------------------------------------
df_GDP <- read_csv("data/raw/data_20250228194704.csv")
## New names:
## Rows: 1064 Columns: 10
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (7): IndicatorName, VariableName, MeasurementName, CountryCode, Alpha3Co... dbl
## (2): IndicatorCode, PeriodCode lgl (1): ...10
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...10`
# apply the mapping to the CountryName column
df_GDP <- df_GDP %>%
mutate(
# replace Czechia with Czech Republic
CountryName = ifelse(CountryName %in% names(country_name_mapping),
country_name_mapping[CountryName],
CountryName)) %>%
select("CountryName", "PeriodCode", "Value") %>%
# now filter after standardizing the names
filter(CountryName %in% survey_country_mapping$country_name)
df_GDP <- df_GDP %>%
mutate(Value = as.numeric(as.character(Value)))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `Value = as.numeric(as.character(Value))`.
## Caused by warning:
## ! NAs introduced by coercion
df_GDP <- df_GDP %>%
# group by country
group_by(CountryName) %>%
# find first and last year values
summarize(
gdp_2005 = Value[PeriodCode == 2005],
gdp_2018 = Value[PeriodCode == 2018],
# calculate relative growth
gdp_growth = (gdp_2018 - gdp_2005) / gdp_2005 * 100) %>%
# add ISO2 codes for easier joining with other datasets
left_join(survey_country_mapping, by = c("CountryName" = "country_name")) %>%
# select relevant columns
select(CountryName, iso2, gdp_2005, gdp_2018, gdp_growth)
saveRDS(df_GDP, file = "df_GDP.rds")
write.csv(df_GDP, "df_GDP.csv")
# 6. ESS Round 9 ----------------------------------------------------------
# https://ess.sikt.no/en/datafile/b2b0bf39-176b-4eca-8d26-3c05ea83d2cb
ess_data <- read_csv("data/raw/ESS9e03_2.csv")
## Rows: 49519 Columns: 572
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): name, proddate, cntry, ctzshipd, cntbrthd, lnghom1, lnghom2, fbrn...
## dbl (562): essround, edition, idno, dweight, pspwght, pweight, anweight, nws...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# select interesting variables, country and weight variables
ess_selected <- ess_data %>%
select(
# identifiers and weights
cntry, # country code
pspwght, # post-stratification weight
dweight, # design weight
# key variables of interest
freehms, # gays and lesbians free to live life as they wish
lrscale, # left-right political scale
rlgdgr, # how religious are you
ipeqopt, # important that people are treated equally
atchctr, # attachment to country
eduyrs, # years of education
agea) # age of respondent
# function to recode ESS special values (negative values are typically missing values)
recode_ess_missing <- function(x) {
ifelse(x < 0, NA, x)
}
# clean the data
ess_clean <- ess_selected %>%
# recode special values to NA
mutate(across(c(freehms, lrscale, rlgdgr, ipeqopt, atchctr, eduyrs, agea),
recode_ess_missing)) %>%
# create derived variables if needed
mutate(
# recode freehms to 0-1 scale (originally 1-5 where 1 = agree strongly)
freehms_support = case_when(
freehms %in% c(1, 2) ~ 1, # agree and strongly agree
freehms %in% c(3, 4, 5) ~ 0, # neutral, disagree, strongly disagree
TRUE ~ NA_real_),
# create age groups
age_group = case_when(
agea < 35 ~ "18-34",
agea < 55 ~ "35-54",
TRUE ~ "55+"
),
# standardise left-right scale to 0-1
lrscale_std = (lrscale - 1) / 9, # Original scale is 1-10
# create high education indicator (above country median)
high_educ = NA # will fill this in after calculating country medians
)
# calculate country median education for relative education measure
country_medians <- ess_clean %>%
group_by(cntry) %>%
summarize(median_educ = median(eduyrs, na.rm = TRUE))
# join back to main data and create high education indicator
ess_clean <- ess_clean %>%
left_join(country_medians, by = "cntry") %>%
mutate(high_educ = ifelse(eduyrs > median_educ, 1, 0))
# calculate weighted means by country
country_aggregates <- ess_clean %>%
# group by country
group_by(cntry) %>%
# calculate weighted statistics
summarize(
# sample size
n_respondents = n(),
n_valid = sum(!is.na(freehms)),
# weighted means
pct_lgbt_support = weighted.mean(freehms_support, w = pspwght, na.rm = TRUE) * 100,
mean_religiosity = weighted.mean(rlgdgr, w = pspwght, na.rm = TRUE),
mean_left_right = weighted.mean(lrscale_std, w = pspwght, na.rm = TRUE),
mean_equal_values = weighted.mean(ipeqopt, w = pspwght, na.rm = TRUE),
mean_country_attach = weighted.mean(atchctr, w = pspwght, na.rm = TRUE),
mean_eduyrs = weighted.mean(eduyrs, w = pspwght, na.rm = TRUE),
mean_age = weighted.mean(agea, w = pspwght, na.rm = TRUE),
# weighted proportions for categorical variables
pct_young = weighted.mean(age_group == "18-34", w = pspwght, na.rm = TRUE) * 100,
pct_high_educ = weighted.mean(high_educ, w = pspwght, na.rm = TRUE) * 100,
# standard errors (for confidence intervals)
se_lgbt_support = sd(freehms_support, na.rm = TRUE) / sqrt(sum(!is.na(freehms_support))),
# missing data proportions
pct_missing_lgbt = mean(is.na(freehms)) * 100)
# calculate cross-variable country indicators
country_indicators <- ess_clean %>%
group_by(cntry) %>%
summarize(
# correlation between age and LGBT support within country
age_lgbt_corr = cor(agea, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
# correlation between religiosity and LGBT support
relig_lgbt_corr = cor(rlgdgr, freehms_support, use = "pairwise.complete.obs", method = "spearman"),
# inequality in LGBT support (standard deviation)
lgbt_support_inequality = sd(freehms_support, na.rm = TRUE),
# education gradient in LGBT support (difference between high and low education)
educ_gradient = weighted.mean(freehms_support[high_educ == 1], w = pspwght[high_educ == 1], na.rm = TRUE) -
weighted.mean(freehms_support[high_educ == 0], w = pspwght[high_educ == 0], na.rm = TRUE))
# join the aggregates and indicators
country_data_final <- country_aggregates %>%
left_join(country_indicators, by = "cntry") %>%
# create ISO country codes for easier merging with other datasets
mutate(
iso2c = countrycode(cntry, "iso2c", "iso2c"),
iso3c = countrycode(cntry, "iso2c", "iso3c"))
# plot LGBT support by country
ggplot(country_data_final, aes(x = reorder(cntry, pct_lgbt_support), y = pct_lgbt_support)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_errorbar(aes(ymin = pct_lgbt_support - 1.96*se_lgbt_support,
ymax = pct_lgbt_support + 1.96*se_lgbt_support),
width = 0.2) +
labs(title = "Support for LGBT Rights by Country",
subtitle = "Percent agreeing gays and lesbians should be free to live as they wish",
x = "Country",
y = "Support (%)") +
theme_minimal() +
coord_flip()
# examine relationship between religiosity and LGBT support
ggplot(country_data_final, aes(x = mean_religiosity, y = pct_lgbt_support, label = cntry)) +
geom_point(size = 3, alpha = 0.7) +
geom_text(hjust = -0.3, vjust = 0.3) +
geom_smooth(method = "lm", se = TRUE, color = "red") +
labs(title = "Religiosity vs. LGBT Support by Country",
x = "Mean Religiosity Score",
y = "LGBT Support (%)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
saveRDS(country_data_final, file = "country_data_final.rds")
write.csv(country_data_final, "country_data_final.csv")
# 7. Unemployment rate ----------------------------------------------------
# https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate
# scrape data from Wikipedia
url <- "https://en.wikipedia.org/wiki/List_of_European_Union_member_states_by_unemployment_rate"
page <- read_html(url)
tables <- html_table(page, fill = TRUE)
eu_unemployment_table <- tables[[1]]
# clean column names - simplify them to more standard names
colnames(eu_unemployment_table) <- c("Country", "Unemployment", "Employment", "Year")
# clean up the country names by removing footnote references
eu_unemployment_table$Country <- gsub("\\[.*?\\]", "", eu_unemployment_table$Country)
# make sure numeric columns are properly formatted
eu_unemployment_table$Unemployment <- as.character(eu_unemployment_table$Unemployment)
eu_unemployment_table$Employment <- as.numeric(as.character(eu_unemployment_table$Employment))
eu_unemployment_table$Year <- as.numeric(as.character(eu_unemployment_table$Year))
eu_unemployment_table <- eu_unemployment_table %>%
mutate(
# trim spaces from country names
Country = trimws(Country),
# convert Unemployment to numeric (remove any % signs or spaces if present)
Unemployment = as.numeric(gsub("[^0-9.]", "", Unemployment)))
# modify Greece's unemployment rate to the 2018 value
eu_unemployment_table$Unemployment[eu_unemployment_table$Country == "Greece"] <- 19.18
eu_unemployment_table$Year[eu_unemployment_table$Country == "Greece"] <- 2018
# add the UK data manually
uk_data <- data.frame(
Country = "United Kingdom",
Unemployment = 4.12, # from Statista
Employment = 75.25, # estimated from https://www.ons.gov.uk/employmentandlabourmarket/peopleinwork/employmentandemployeetypes/articles/singlemonthlabourforcesurveyestimates/december2018
Year = 2018)
# append UK data to the table
eu_unemployment_table <- rbind(eu_unemployment_table, uk_data)
# ignore the year column
eu_unemployment_table <- eu_unemployment_table %>%
select(-Year)
# save
saveRDS(eu_unemployment_table, file = "eu_unemployment_table.rds")
write.csv(eu_unemployment_table, "eu_unemployment_table.csv", row.names = FALSE)
# 8. Gender equality ------------------------------------------------------
# https://eige.europa.eu/gender-statistics/dgs/indicator/index__index_scores/datatable?time=2017&col=domain&row=geo
gender_eq_index <- read_xlsx("data/raw/index__index_scores.xlsx", range = "A16:V44")
gender_eq_index <- gender_eq_index %>%
select(country_name = "Geographic region\\(Sub-) Domain Scores",
gender_equality_index = "Overall Gender Equality Index") %>%
mutate(country_name = ifelse(country_name == "Czechia", "Czech Republic", country_name))
# Merge the data ----------------------------------------------------------
# create a base dataframe with country identifying variables
country_level_df <- survey_country_mapping
# merge V-Dem data
country_level_df <- country_level_df %>%
left_join(vdem_2019, by = c("country_name" = "country_name"))
# fix country name in democracy_scores for Czech Republic if needed
if(any(democracy_scores$Country == "Czechia")) {
democracy_scores$Country[democracy_scores$Country == "Czechia"] <- "Czech Republic"
}
# merge democracy scores
country_level_df <- country_level_df %>%
left_join(democracy_scores, by = c("iso2" = "ISO2"))
# merge GDP data
country_level_df <- country_level_df %>%
left_join(df_GDP, by = "iso2")
# rename some countries to match our country_name format
rainbow_country_mapping <- data.frame(
original = c("Czechia", "Andorra", "Bosnia & Herzegovina", "North Macedonia", "United Kingdom"),
standardized = c("Czech Republic", "Andorra", "Bosnia and Herzegovina", "Macedonia", "United Kingdom"),
stringsAsFactors = FALSE)
# apply standardized country names
for(i in 1:nrow(rainbow_country_mapping)) {
rainbow_df$Country[rainbow_df$Country == rainbow_country_mapping$original[i]] <-
rainbow_country_mapping$standardized[i]
}
# extract and rename rainbow map variables for clarity
rainbow_data_clean <- rainbow_df %>%
select(
Country,
rainbow_score_2019 = Value_2019,
rainbow_score_2018 = Value_2018,
rainbow_score_avg_2019_2018 = Avg_2019_2018,
rainbow_score_avg_2013_2014 = Avg_2013_2014,
rainbow_score_difference = Difference)
# merge Rainbow Map data
country_level_df <- country_level_df %>%
left_join(rainbow_data_clean, by = c("country_name" = "Country"))
# merge happiness data
country_level_df <- country_level_df %>%
left_join(happiness_scores, by = c("iso2" = "ISO2"))
# merge unemployment data
country_level_df <- country_level_df %>%
left_join(eu_unemployment_table, by = c("country_name" = "Country"))
# create a mapping between ESS country codes and ISO2
ess_country_mapping <- data.frame(
cntry = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
"DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO",
"SK", "SI", "ES", "SE", "GB"),
iso2 = c("AT", "BE", "BG", "HR", "CY", "CZ", "DK", "EE", "FI", "FR",
"DE", "HU", "IE", "IT", "LV", "LT", "NL", "PL", "PT", "RO",
"SK", "SI", "ES", "SE", "GB"),
stringsAsFactors = FALSE)
# first ensure cntry codes match our iso2 codes
country_data_final <- country_data_final %>%
left_join(ess_country_mapping, by = "cntry") %>%
select(-iso2c, -iso3c) # remove original ISO codes to avoid confusion
# rename variables for clarity
ess_data_clean <- country_data_final %>%
select(
cntry,
n_respondents,
n_valid,
lgbt_support_percent = pct_lgbt_support,
mean_religiosity,
mean_left_right,
mean_equal_values,
mean_country_attach,
mean_eduyrs,
mean_age,
pct_young,
pct_high_educ,
se_lgbt_support,
pct_missing_lgbt,
age_lgbt_corr,
relig_lgbt_corr,
lgbt_support_inequality,
educ_gradient,
#iso3c
)
# merge ESS data
country_level_df <- country_level_df %>%
left_join(ess_data_clean, by = c("iso2" = "cntry"))
country_level_df %>% View()
# delete the first Greece row
greece_rows <- which(country_level_df$country_name == "Greece")
if (length(greece_rows) > 1) {
# remove the first instance of Greece
country_level_df <- country_level_df[-greece_rows[1], ]
# verify the fix worked
greece_check <- country_level_df %>%
filter(country_name == "Greece")
print("Greece entries after removing the first instance:")
print(greece_check)
}
# add the gender equality index
country_level_df <- country_level_df %>%
left_join(gender_eq_index, by = "country_name")
## scaling: necessary for the regression analysis later on because the variables have vastly different scales
country_level_df %>%
select(where(is.numeric)) %>%
summary()
## v2x_libdem v2x_polyarchy v2x_gender v2x_egaldem
## Min. :0.3680 Min. :0.4720 Min. :0.8410 Min. :0.3340
## 1st Qu.:0.7143 1st Qu.:0.8105 1st Qu.:0.8908 1st Qu.:0.6977
## Median :0.7855 Median :0.8595 Median :0.9220 Median :0.7550
## Mean :0.7386 Mean :0.8236 Mean :0.9147 Mean :0.7195
## 3rd Qu.:0.8220 3rd Qu.:0.8808 3rd Qu.:0.9445 3rd Qu.:0.7917
## Max. :0.8840 Max. :0.9140 Max. :0.9590 Max. :0.8760
##
## v2x_liberal v2xcs_ccsi v2x_freexp Overall_score
## Min. :0.7060 Min. :0.5460 Min. :0.6470 Min. :6.380
## 1st Qu.:0.8718 1st Qu.:0.8395 1st Qu.:0.9113 1st Qu.:7.357
## Median :0.9230 Median :0.9125 Median :0.9440 Median :7.790
## Mean :0.8914 Mean :0.8844 Mean :0.9088 Mean :7.886
## 3rd Qu.:0.9380 3rd Qu.:0.9425 3rd Qu.:0.9702 3rd Qu.:8.568
## Max. :0.9810 Max. :0.9710 Max. :0.9840 Max. :9.390
##
## Electoral_process_and_pluralism Functioning_of_government
## Min. : 8.750 Min. :5.360
## 1st Qu.: 9.170 1st Qu.:6.340
## Median : 9.580 Median :7.320
## Mean : 9.493 Mean :7.373
## 3rd Qu.: 9.580 3rd Qu.:8.300
## Max. :10.000 Max. :9.640
##
## Political_participation Political_culture Civil_liberties gdp_2005
## Min. :5.000 Min. : 4.380 Min. : 7.060 Min. : 9602
## 1st Qu.:6.110 1st Qu.: 6.250 1st Qu.: 8.240 1st Qu.:16572
## Median :6.670 Median : 6.880 Median : 8.820 Median :26314
## Mean :6.885 Mean : 7.034 Mean : 8.656 Mean :26425
## 3rd Qu.:8.330 3rd Qu.: 8.130 3rd Qu.: 9.120 3rd Qu.:32867
## Max. :8.330 Max. :10.000 Max. :10.000 Max. :68708
##
## gdp_2018 gdp_growth rainbow_score_2019 rainbow_score_2018
## Min. : 24052 Min. : 19.15 Min. :16.83 Min. :16.07
## 1st Qu.: 32335 1st Qu.: 53.44 1st Qu.:25.69 1st Qu.:28.92
## Median : 41545 Median : 68.22 Median :44.77 Median :51.40
## Mean : 45761 Mean : 83.19 Mean :42.98 Mean :49.39
## 3rd Qu.: 52630 3rd Qu.:114.35 3rd Qu.:57.75 3rd Qu.:67.19
## Max. :116334 Max. :207.95 Max. :74.72 Max. :91.94
##
## rainbow_score_avg_2019_2018 rainbow_score_avg_2013_2014
## Min. :16.45 Min. :19.65
## 1st Qu.:26.89 1st Qu.:29.31
## Median :48.34 Median :40.52
## Mean :46.19 Mean :43.86
## 3rd Qu.:62.90 3rd Qu.:60.91
## Max. :83.33 Max. :79.38
##
## rainbow_score_difference Happiness_Score Unemployment Employment
## Min. :-10.270 Min. :4.933 Min. : 2.800 Min. :52.50
## 1st Qu.: -6.414 1st Qu.:5.848 1st Qu.: 4.115 1st Qu.:64.97
## Median : 0.185 Median :6.173 Median : 5.650 Median :67.40
## Mean : 2.325 Mean :6.337 Mean : 6.654 Mean :67.17
## 3rd Qu.: 5.274 3rd Qu.:6.938 3rd Qu.: 7.550 3rd Qu.:71.17
## Max. : 37.280 Max. :7.632 Max. :19.180 Max. :77.00
## NA's :1
## n_respondents n_valid lgbt_support_percent mean_religiosity
## Min. : 781 Min. : 781 Min. :30.85 Min. :3.401
## 1st Qu.:1529 1st Qu.:1529 1st Qu.:55.17 1st Qu.:4.490
## Median :1761 Median :1761 Median :75.34 Median :5.248
## Mean :1769 Mean :1769 Mean :70.21 Mean :5.343
## 3rd Qu.:2200 3rd Qu.:2200 3rd Qu.:88.25 3rd Qu.:6.029
## Max. :2745 Max. :2745 Max. :94.25 Max. :7.513
## NA's :4 NA's :4 NA's :4 NA's :4
## mean_left_right mean_equal_values mean_country_attach mean_eduyrs
## Min. :0.8134 Min. :1.718 Min. :6.974 Min. :10.97
## 1st Qu.:1.1858 1st Qu.:2.058 1st Qu.:7.977 1st Qu.:13.40
## Median :1.4188 Median :2.164 Median :8.116 Median :13.97
## Mean :1.7429 Mean :2.301 Mean :8.177 Mean :13.96
## 3rd Qu.:2.1136 3rd Qu.:2.529 3rd Qu.:8.484 3rd Qu.:14.43
## Max. :3.8986 Max. :3.271 Max. :9.341 Max. :16.15
## NA's :4 NA's :4 NA's :4 NA's :4
## mean_age pct_young pct_high_educ se_lgbt_support
## Min. :45.46 Min. :23.93 Min. :26.32 Min. :0.005644
## 1st Qu.:48.48 1st Qu.:26.64 1st Qu.:36.40 1st Qu.:0.007032
## Median :49.44 Median :28.34 Median :41.12 Median :0.009516
## Mean :52.31 Mean :28.25 Mean :39.98 Mean :0.010061
## 3rd Qu.:54.73 3rd Qu.:29.77 3rd Qu.:45.45 3rd Qu.:0.011830
## Max. :68.17 Max. :34.43 Max. :52.72 Max. :0.018152
## NA's :4 NA's :4 NA's :4 NA's :4
## pct_missing_lgbt age_lgbt_corr relig_lgbt_corr
## Min. :0 Min. :-0.253892 Min. :-0.27248
## 1st Qu.:0 1st Qu.:-0.183525 1st Qu.:-0.19308
## Median :0 Median :-0.132761 Median :-0.14838
## Mean :0 Mean :-0.132479 Mean :-0.14908
## 3rd Qu.:0 3rd Qu.:-0.084937 3rd Qu.:-0.11943
## Max. :0 Max. : 0.009616 Max. :-0.02086
## NA's :4 NA's :4 NA's :4
## lgbt_support_inequality educ_gradient gender_equality_index
## Min. :0.2294 Min. :0.01615 Min. :50.00
## 1st Qu.:0.3139 1st Qu.:0.04848 1st Qu.:55.55
## Median :0.4196 Median :0.07695 Median :60.10
## Mean :0.3966 Mean :0.08256 Mean :62.38
## 3rd Qu.:0.4836 3rd Qu.:0.12288 3rd Qu.:69.25
## Max. :0.5001 Max. :0.16813 Max. :82.60
## NA's :4 NA's :4 NA's :1
# we'll apply different scaling methods based on the type of variable:
# 1. z-score standardization: for continuous variables to make them comparable (mean=0, sd=1)
# 2. min-max normalization: for variables with natural bounds (e.g., 0-100 scores)
# 3. log transformation: for heavily skewed variables like GDP
# custom function to standardize (z-score)
standardize <- function(x) {
(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}
# custom function to min-max normalize
normalize <- function(x) {
(x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
}
# z-scores for the democracy scores
country_level_df <- country_level_df %>%
mutate(
z_v2x_libdem = standardize(v2x_libdem),
z_v2x_polyarchy = standardize(v2x_polyarchy),
z_v2x_gender = standardize(v2x_gender),
z_v2x_egaldem = standardize(v2x_egaldem),
z_v2x_liberal = standardize(v2x_liberal),
z_v2xcs_ccsi = standardize(v2xcs_ccsi),
z_v2x_freexp = standardize(v2x_freexp))
# z-scores for GDP and unemployment rate, natural log for GDP
country_level_df <- country_level_df %>%
mutate(
z_gdp_2018 = standardize(gdp_2018),
log_gdp_2018 = log(gdp_2018),
z_gdp_growth = standardize(gdp_growth),
z_unemployment = standardize(Unemployment))
# z-scores and normalised rainbow variables
country_level_df <- country_level_df %>%
mutate(
z_rainbow_score = standardize(rainbow_score_2019),
norm_rainbow_score = normalize(rainbow_score_2019),
z_lgbt_support = standardize(lgbt_support_percent),
norm_lgbt_support = normalize(lgbt_support_percent))
# z-socres and normalised for happiness scores and gender equality
country_level_df <- country_level_df %>%
mutate(
z_happiness = standardize(Happiness_Score),
z_gender_equality = standardize(gender_equality_index),
norm_gender_equality = normalize(gender_equality_index))
# religion, politics, education and age
country_level_df <- country_level_df %>%
mutate(
z_religiosity = standardize(mean_religiosity),
z_functioning_of_government = standardize(Functioning_of_government),
z_left_right = standardize(mean_left_right),
z_equal_values = standardize(mean_equal_values),
z_country_attach = standardize(mean_country_attach),
z_eduyrs = standardize(mean_eduyrs),
z_age = standardize(mean_age))
# create two composite scores and regional classification
country_level_df <- country_level_df %>%
mutate(
composite_equality = rowMeans(
cbind(z_gender_equality, z_rainbow_score, z_v2x_gender),
na.rm = TRUE))
country_level_df <- country_level_df %>%
mutate(
composite_democracy = rowMeans(
cbind(z_v2x_libdem, z_v2x_polyarchy, z_v2x_libdem, z_v2x_freexp),
na.rm = TRUE))
country_level_df <- country_level_df %>%
mutate(
region = case_when(
country_name %in% c("Denmark", "Finland", "Sweden", "Estonia", "Latvia", "Lithuania") ~
"Northern Europe",
country_name %in% c("Belgium", "Netherlands", "Luxembourg", "Germany", "France",
"Austria", "United Kingdom", "Ireland") ~
"Western Europe",
country_name %in% c("Portugal", "Spain", "Italy", "Malta", "Greece", "Cyprus") ~
"Southern Europe",
country_name %in% c("Poland", "Czech Republic", "Slovakia", "Hungary", "Slovenia",
"Croatia", "Romania", "Bulgaria") ~ "Eastern Europe",
TRUE ~ NA_character_))
# delete the "Country.x", "Country.y" and "CountryName" columns
country_level_df <- country_level_df %>%
select(-c("Country.x", "Country.y", "CountryName"))
# save as RDS and csv
saveRDS(country_level_df, file = "country_level_df.rds")
write.csv(country_level_df, "country_level_df.csv")
### data cleaning
library(tidyverse)
library(sjlabelled)
##
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
##
## as_factor
## The following object is masked from 'package:dplyr':
##
## as_label
## The following object is masked from 'package:ggplot2':
##
## as_label
library(haven)
##
## Attaching package: 'haven'
## The following objects are masked from 'package:sjlabelled':
##
## as_factor, read_sas, read_spss, read_stata, write_sas, zap_labels
library(mice)
data <- read_dta("data/raw/ZA7575.dta")
###
# code the questions properly:
# 'DK' and refusals are especially a problem for questions with ordinal variables,
# less so for non-ordinal variables; for reasons of completeness, we clean all of
# the 'DK' and refusals (and some other special cases) by converting them to NAs
data_correctly_coded <- data %>%
mutate(
# hard cases for me
d72_1 = ifelse(d72_1 %in% c(5,6), NA, d72_1),
d72_2 = ifelse(d72_2 %in% c(5,6), NA, d72_2),
d60 = ifelse(d60 == 7, NA, d60),
d25 = ifelse(d25 == 8, NA, d25),
d8 = ifelse(d8 > 70, NA, d8), # subjective decision that no one can have more than 70 years of ed (even full-time professors)
d7 = ifelse(d7 %in% c(15,97), NA, d7),
d1 = ifelse(d1 %in% c(97,98), NA, d1),
qa16_1 = ifelse(qa16_1 == 5, NA, qa16_1),
qa16_2 = ifelse(qa16_2 == 5, NA, qa16_2),
qa16_3 = ifelse(qa16_3 == 5, NA, qa16_3),
qa16_4 = ifelse(qa16_4 == 5, NA, qa16_4),
d71_1 = ifelse(d71_1 == 4, NA, d71_1),
d71_2 = ifelse(d71_2 == 4, NA, d71_2),
d71_3 = ifelse(d71_3 == 4, NA, d71_3),
#qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
#qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
#qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
#qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb3_1 = ifelse(qb3_1 == 5, NA, qb3_1),
qb3_2 = ifelse(qb3_2 == 5, NA, qb3_2),
qb3_3 = ifelse(qb3_3 == 5, NA, qb3_3),
qb3_4 = ifelse(qb3_4 == 5, NA, qb3_4),
qb3_5 = ifelse(qb3_5 == 5, NA, qb3_5),
qb3_6 = ifelse(qb3_6 == 5, NA, qb3_6),
qb3_7 = ifelse(qb3_7 == 5, NA, qb3_7),
qb4_1 = ifelse(qb4_1 == 5, NA, qb4_1),
qb4_2 = ifelse(qb4_2 == 5, NA, qb4_2),
qb4_3 = ifelse(qb4_3 == 5, NA, qb4_3),
qb4_4 = ifelse(qb4_4 == 5, NA, qb4_4),
qb4_5 = ifelse(qb4_5 == 5, NA, qb4_5),
qb5_1 = ifelse(qb5_1 == 5, NA, qb5_1),
qb5_2 = ifelse(qb5_2 == 5, NA, qb5_2),
qb5_3 = ifelse(qb5_3 == 5, NA, qb5_3),
qb5_4 = ifelse(qb5_4 == 5, NA, qb5_4),
sd1_1 = ifelse(sd1_1 %in% c(3,4), NA, sd1_1),
sd1_2 = ifelse(sd1_2 %in% c(3,4), NA, sd1_2),
sd1_3 = ifelse(sd1_3 %in% c(3,4), NA, sd1_3),
sd1_4 = ifelse(sd1_4 %in% c(3,4), NA, sd1_4),
sd1_5 = ifelse(sd1_5 %in% c(3,4), NA, sd1_5),
sd1_6 = ifelse(sd1_6 %in% c(3,4), NA, sd1_6),
sd1_7 = ifelse(sd1_7 %in% c(3,4), NA, sd1_7),
sd1_8 = ifelse(sd1_8 %in% c(3,4), NA, sd1_8),
qc1_1 = ifelse(qc1_1 == 6, NA, qc1_1),
qc1_2 = ifelse(qc1_2 == 6, NA, qc1_2),
qc1_3 = ifelse(qc1_3 == 6, NA, qc1_3),
qc1_4 = ifelse(qc1_4 == 6, NA, qc1_4),
qc1_5 = ifelse(qc1_5 == 6, NA, qc1_5),
qc1_6 = ifelse(qc1_6 == 6, NA, qc1_6),
qc1_7 = ifelse(qc1_7 == 6, NA, qc1_7),
qc1_8 = ifelse(qc1_8 == 6, NA, qc1_8),
qc1_9 = ifelse(qc1_9 == 6, NA, qc1_9),
qc1_10 = ifelse(qc1_10 == 6, NA, qc1_10),
qc5_1 = ifelse(qc5_1 == 3, NA, qc5_1),
qc5_2 = ifelse(qc5_2 == 3, NA, qc5_2),
qc5_3 = ifelse(qc5_3 == 3, NA, qc5_3),
qc5_4 = ifelse(qc5_4 == 3, NA, qc5_4),
qc6_1 = ifelse(qc6_1 == 12, NA, qc6_1),
qc6_2 = ifelse(qc6_2 == 12, NA, qc6_2),
qc6_3 = ifelse(qc6_3 == 12, NA, qc6_3),
qc6_4 = ifelse(qc6_4 == 12, NA, qc6_4),
qc6_5 = ifelse(qc6_5 == 12, NA, qc6_5),
qc6_6 = ifelse(qc6_6 == 12, NA, qc6_6),
qc6_7 = ifelse(qc6_7 == 12, NA, qc6_7),
qc6_8 = ifelse(qc6_8 == 12, NA, qc6_8),
qc6_9 = ifelse(qc6_9 == 12, NA, qc6_9),
qc6_10 = ifelse(qc6_10 == 12, NA, qc6_10),
qc6_11 = ifelse(qc6_11 == 12, NA, qc6_11),
qc9_1 = ifelse(qc9_1 == 7, NA, qc9_1),
qc9_2 = ifelse(qc9_2 == 7, NA, qc9_2),
qc9_3 = ifelse(qc9_3 == 7, NA, qc9_3),
qc9_4 = ifelse(qc9_4 == 7, NA, qc9_4),
qc9_5 = ifelse(qc9_5 == 7, NA, qc9_5),
qc9_6 = ifelse(qc9_6 == 7, NA, qc9_6),
qc9_7 = ifelse(qc9_7 == 7, NA, qc9_7),
qc9_8 = ifelse(qc9_8 == 7, NA, qc9_8),
qc9_9 = ifelse(qc9_9 == 7, NA, qc9_9),
qc9_10 = ifelse(qc9_10 == 7, NA, qc9_10),
qc9_11 = ifelse(qc9_11 == 7, NA, qc9_11),
qc11_1 = ifelse(qc11_1 == 6, NA, qc11_1),
qc11_2 = ifelse(qc11_2 == 6, NA, qc11_2),
qc11_3 = ifelse(qc11_3 == 6, NA, qc11_3),
qc11_4 = ifelse(qc11_4 == 6, NA, qc11_4),
qc11_5 = ifelse(qc11_5 == 6, NA, qc11_5),
qc11_6 = ifelse(qc11_6 == 6, NA, qc11_6),
qc12_1 = ifelse(qc12_1 %in% c(11,12,13), NA, qc12_1),
qc12_2 = ifelse(qc12_2 %in% c(11,12,13), NA, qc12_2),
qc12_3 = ifelse(qc12_3 %in% c(11,12,13), NA, qc12_3),
qc12_4 = ifelse(qc12_4 %in% c(11,12,13), NA, qc12_4),
qc12_5 = ifelse(qc12_5 %in% c(11,12,13), NA, qc12_5),
qc12_6 = ifelse(qc12_6 %in% c(11,12,13), NA, qc12_6),
qc12_7 = ifelse(qc12_7 %in% c(11,12,13), NA, qc12_7),
qc12_8 = ifelse(qc12_8 %in% c(11,12,13), NA, qc12_8),
qc12_9 = ifelse(qc12_9 %in% c(11,12,13), NA, qc12_9),
qc12_10 = ifelse(qc12_10 %in% c(11,12,13), NA, qc12_10),
qc12_11 = ifelse(qc12_11 %in% c(11,12,13), NA, qc12_11),
qc12_12 = ifelse(qc12_12 %in% c(11,12,13), NA, qc12_12),
qc12_13 = ifelse(qc12_13 %in% c(11,12,13), NA, qc12_13),
qc12_14 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_14),
qc12_15 = ifelse(qc12_14 %in% c(11,12,13), NA, qc12_15),
qc13_1 = ifelse(qc13_1 %in% c(11,12,13), NA, qc13_1),
qc13_2 = ifelse(qc13_2 %in% c(11,12,13), NA, qc13_2),
qc13_3 = ifelse(qc13_3 %in% c(11,12,13), NA, qc13_3),
qc13_4 = ifelse(qc13_4 %in% c(11,12,13), NA, qc13_4),
qc13_5 = ifelse(qc13_5 %in% c(11,12,13), NA, qc13_5),
qc13_6 = ifelse(qc13_6 %in% c(11,12,13), NA, qc13_6),
qc13_7 = ifelse(qc13_7 %in% c(11,12,13), NA, qc13_7),
qc13_8 = ifelse(qc13_8 %in% c(11,12,13), NA, qc13_8),
qc13_9 = ifelse(qc13_9 %in% c(11,12,13), NA, qc13_9),
qc13_10 = ifelse(qc13_10 %in% c(11,12,13), NA, qc13_10),
qc13_11 = ifelse(qc13_11 %in% c(11,12,13), NA, qc13_11),
qc13_12 = ifelse(qc13_12 %in% c(11,12,13), NA, qc13_12),
qc13_13 = ifelse(qc13_13 %in% c(11,12,13), NA, qc13_13),
qc13_14 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_14),
qc13_15 = ifelse(qc13_14 %in% c(11,12,13), NA, qc13_15),
qc15_1 = ifelse(qc15_1 == 5, NA, qc15_1),
qc15_2 = ifelse(qc15_2 == 5, NA, qc15_2),
qc15_3 = ifelse(qc15_3 == 5, NA, qc15_3),
qc16_1 = ifelse(qc16_1 == 5, NA, qc16_1),
qc17_1 = ifelse(qc17_1 == 5, NA, qc17_1),
qc17_2 = ifelse(qc17_2 == 5, NA, qc17_2),
qc17_3 = ifelse(qc17_3 == 5, NA, qc17_3),
qc17_4 = ifelse(qc17_4 == 5, NA, qc17_4),
qc17_5 = ifelse(qc17_5 == 5, NA, qc17_5),
qc17_6 = ifelse(qc17_6 == 5, NA, qc17_6),
qc17_7 = ifelse(qc17_7 == 5, NA, qc17_7),
qc18_1 = ifelse(qc18_1 %in% c(11,12), NA, qc18_1),
qc18_2 = ifelse(qc18_2 %in% c(11,12), NA, qc18_2),
qc18_3 = ifelse(qc18_3 %in% c(11,12), NA, qc18_3),
sd3 = ifelse(sd3 %in% c(15,16), NA, sd3),
# easy cases for Claude
#q1 = ifelse(q1 %in% c(29,30), NA, q1),
qa1 = ifelse(qa1 == 5, NA, qa1),
qa7 = ifelse(qa7 == 5, NA, qa7),
qa8 = ifelse(qa8 == 4, NA, qa8),
qa9 = ifelse(qa9 == 6, NA, qa9),
qa14 = ifelse(qa14 == 6, NA, qa14),
qa17 = ifelse(qa17 == 5, NA, qa17),
qb6 = ifelse(qb6 == 4, NA, qb6),
qb7 = ifelse(qb7 == 5, NA, qb7),
qc19 = ifelse(qc19 == 3, NA, qc19),
qc20 = ifelse(qc20 == 3, NA, qc20),
d63 = ifelse(d63 %in% c(6,7,8,9), NA, d63), # manually because stupid Claude
d77 = ifelse(d77 == 5, NA, d77),
d70 = ifelse(d70 == 5, NA, d70),
qa4a = ifelse(qa4a %in% c(7,8), NA, qa4a),
qa5a = ifelse(qa5a %in% c(11,12,13), NA, qa5a),
qa11 = ifelse(qa11 %in% c(4,5), NA, qa11),
qa12 = ifelse(qa12 %in% c(5,6), NA, qa12),
qa13 = ifelse(qa13 %in% c(6,7), NA, qa13),
qa18a = ifelse(qa18a %in% c(7,8,9), NA, qa18a),
qb8 = ifelse(qb8 == 5, NA, qb8),
sd3 = ifelse(sd3 == 16, NA, sd3),
qc3 = ifelse(qc3 %in% c(10,11), NA, qc3),
qc7 = ifelse(qc7 %in% c(11,12), NA, qc7),
qc8 = ifelse(qc8 %in% c(11,12), NA, qc8),
qc10 = ifelse(qc10 %in% c(9,10), NA, qc10))
# write_rds(data_correctly_coded, file = "data_correctly_coded.rds")
###
# get rid of all the "r" coded ones
# for the questions with underscores, (1) if they are on a scale, e.g., 1-3 and 'DK'
# is one of them, replace 'DK'; if it's 0-1 coding, keep that (can't do anything about it)
data_correctly_coded <- data_correctly_coded %>%
select(-matches("_r$|_r[0-9]$|_r[0-9]_"))
# use setdiff() to check whether it's done properly
###
# now analyse the patterns of NAs across columns
missing_data_before <- data_correctly_coded %>%
summarise(across(everything(), ~sum(is.na(.))/n())) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "missing_proportion") %>%
arrange(desc(missing_proportion))
# view top variables with missing data
print(head(missing_data_before, 20))
## # A tibble: 20 × 2
## variable missing_proportion
## <chr> <dbl>
## 1 p6mt 0.982
## 2 p13mt 0.982
## 3 p6cy 0.982
## 4 p7cy 0.982
## 5 p6lu 0.981
## 6 p7lu 0.981
## 7 p13lu 0.981
## 8 p6hr 0.964
## 9 p7hr 0.964
## 10 p6ee 0.963
## 11 p6fi 0.963
## 12 p6lt 0.963
## 13 p7ee 0.963
## 14 p7fi 0.963
## 15 p7lt 0.963
## 16 p13ee 0.963
## 17 p13fi 0.963
## 18 p6dk 0.963
## 19 p7dk 0.963
## 20 p6es 0.963
# viz
ggplot(missing_data_before %>% filter(missing_proportion > 0.25),
aes(x = reorder(variable, missing_proportion), y = missing_proportion)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Variables with >25% Missing Data",
x = "Variable",
y = "Proportion Missing") +
theme_minimal()
# calculate overall proportion of missing data
mean(missing_data_before$missing_proportion)
## [1] 0.1424237
# deeper analysis
missing_by_prefix <- missing_data_before %>%
mutate(prefix = str_extract(variable, "^[a-z]+\\d+")) %>%
group_by(prefix) %>%
summarise(
avg_missing = mean(missing_proportion),
n_vars = n(),
max_missing = max(missing_proportion),
min_missing = min(missing_proportion)
) %>%
arrange(desc(avg_missing))
print(head(missing_by_prefix, 15))
## # A tibble: 15 × 5
## prefix avg_missing n_vars max_missing min_missing
## <chr> <dbl> <int> <dbl> <dbl>
## 1 p13 0.945 8 0.982 0.779
## 2 p6 0.931 29 0.982 0
## 3 p7 0.930 28 0.982 0.0180
## 4 qc3 0.861 1 0.861 0.861
## 5 qa3 0.679 7 0.679 0.679
## 6 qa15 0.627 5 0.627 0.627
## 7 qa2 0.376 8 0.376 0.376
## 8 d15 0.261 2 0.522 0
## 9 qb7 0.208 1 0.208 0.208
## 10 d40 0.157 5 0.784 0
## 11 qa13 0.124 1 0.124 0.124
## 12 qc19 0.120 1 0.120 0.120
## 13 qc16 0.117 1 0.117 0.117
## 14 qc20 0.116 1 0.116 0.116
## 15 qc10 0.109 1 0.109 0.109
# we take out the following variables because (1) they exhibited high missingness
# while at the same aren't super releveant (that's an assumption we make) for our
# subsequent analysis
data_correctly_coded <- data_correctly_coded %>%
select(
# Exclude variables with specific prefixes
-starts_with("p6"), # Size of locality
-starts_with("p7"), # Region
-starts_with("p13"), # Language of interview
-starts_with("d40"), # Household size
-starts_with("qa3"), # Not benefitting from trade
-starts_with("qa15"), # Countries bought goods from
-starts_with("qa2"), # Benefitting from trade
-starts_with("qb8"), # EU energy label influence
-starts_with("qa18b"), # Information sources follow-up
-starts_with("qc3") # Discrimination circumstances
)
# check how much we imporved the missingness rate
missing_data_after <- data_correctly_coded %>%
summarise(across(everything(), ~sum(is.na(.))/n())) %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "missing_proportion") %>%
arrange(desc(missing_proportion))
mean(missing_data_after$missing_proportion)
## [1] 0.02018962
# prepare for imputation
data_correctly_coded <- data_correctly_coded %>%
mutate(isocntry = as.factor(isocntry)) %>%
# remove other problematic variables (like IDs) if needed
select(!any_of(c("doi", "version", "caseid", "uniqid", "serialid"))) %>%
sjlabelled::remove_all_labels()
# delete variables with zero variance as they cannot ever be useful predictors
var_zero <- data_correctly_coded %>%
select(where(is.numeric)) %>%
summarise(across(everything(), var, na.rm = TRUE)) %>%
pivot_longer(everything(), names_to = "variable", values_to = "variance") %>%
filter(variance == 0 | is.na(variance))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(everything(), var, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
if(nrow(var_zero) > 0) {
cat("Removing", nrow(var_zero), "variables with zero variance\n")
data_correctly_coded <- data_correctly_coded %>%
select(!any_of(var_zero$variable))
}
## Removing 6 variables with zero variance
data_correctly_coded <- data_correctly_coded %>%
# convert country codes: collapse East and West Germany into one DE
mutate(isocntry = case_when(
isocntry %in% c("DE-W", "DE-E") ~ "DE",
TRUE ~ isocntry))
###
# run the imputation
set.seed(1212)
start_time <- Sys.time()
#imp_model <- mice(data_correctly_coded, m = 3, method = 'rf', maxit = 3)
end_time <- Sys.time()
end_time - start_time
## Time difference of 0.0007889271 secs
#df_rf_new <- complete(imp_model, action = 3)
#write_rds(df_rf_new, "df_rf_new.rds")
## why still z-scores for mean religosity, left-right etc.
country_level_df <- readRDS("country_level_df.rds")
# first, let's check which variables have missing values
missing_summary <- colSums(is.na(country_level_df))
missing_summary <- missing_summary[missing_summary > 0]
print(missing_summary)
## Happiness_Score n_respondents n_valid
## 1 4 4
## lgbt_support_percent mean_religiosity mean_left_right
## 4 4 4
## mean_equal_values mean_country_attach mean_eduyrs
## 4 4 4
## mean_age pct_young pct_high_educ
## 4 4 4
## se_lgbt_support pct_missing_lgbt age_lgbt_corr
## 4 4 4
## relig_lgbt_corr lgbt_support_inequality educ_gradient
## 4 4 4
## gender_equality_index z_lgbt_support norm_lgbt_support
## 1 4 4
## z_happiness z_gender_equality norm_gender_equality
## 1 1 1
## z_religiosity z_left_right z_equal_values
## 4 4 4
## z_country_attach z_eduyrs z_age
## 4 4 4
# Visualize missing data patterns
vis_miss(country_level_df)
# we have the following imputation strategy:
# (1) we will have to delete the variables from the ESS because the Eurobarometer
# survey contains similar information and imputing these values didn't seem appropriate
# (2) we chose to impute the happiness score using KNN
# (1)
# delete ESS variables
ESS_variables <- c(
# original survey variables
"n_respondents", "n_valid", "lgbt_support_percent", "mean_religiosity",
"mean_left_right", "mean_equal_values", "mean_country_attach",
"mean_eduyrs", "mean_age", "pct_young", "pct_high_educ", "se_lgbt_support",
# additional ESS metrics
"pct_missing_lgbt", "age_lgbt_corr", "relig_lgbt_corr",
"lgbt_support_inequality", "educ_gradient",
# z-score transformations of ESS variables
"z_religiosity", "z_left_right", "z_equal_values",
"z_country_attach", "z_eduyrs", "z_age")
country_level_df <- country_level_df %>%
select(-all_of(ESS_variables))
# (2)
# create separate datasets for variables that need imputation
missing_variables <- names(missing_summary)[missing_summary < 5 &
!(names(missing_summary) %in% ESS_variables)]
# for our KNN imputation, we need to identify variables to use as predictors
# these should be variables without NAs that also correlate with those we want to impute
# identify complete variables that could serve as predictors
complete_variables <- names(country_level_df)[colSums(is.na(country_level_df)) == 0]
complete_variables <- complete_variables[!complete_variables %in% c("Unnamed: 0", "country_name", "iso2", "region")]
print(complete_variables)
## [1] "v2x_libdem" "v2x_polyarchy"
## [3] "v2x_gender" "v2x_egaldem"
## [5] "v2x_liberal" "v2xcs_ccsi"
## [7] "v2x_freexp" "Overall_score"
## [9] "Electoral_process_and_pluralism" "Functioning_of_government"
## [11] "Political_participation" "Political_culture"
## [13] "Civil_liberties" "Regime_type"
## [15] "gdp_2005" "gdp_2018"
## [17] "gdp_growth" "rainbow_score_2019"
## [19] "rainbow_score_2018" "rainbow_score_avg_2019_2018"
## [21] "rainbow_score_avg_2013_2014" "rainbow_score_difference"
## [23] "Unemployment" "Employment"
## [25] "z_v2x_libdem" "z_v2x_polyarchy"
## [27] "z_v2x_gender" "z_v2x_egaldem"
## [29] "z_v2x_liberal" "z_v2xcs_ccsi"
## [31] "z_v2x_freexp" "z_gdp_2018"
## [33] "log_gdp_2018" "z_gdp_growth"
## [35] "z_unemployment" "z_rainbow_score"
## [37] "norm_rainbow_score" "z_functioning_of_government"
## [39] "composite_equality" "composite_democracy"
# custom function to impute variables with KNN, handling non-numeric data
knn_impute <- function(data, vars_to_impute, predictor_vars, k = 5) {
# create a copy of the data
imputed_data <- data
# ensure predictor variables are numeric and complete
pred_data <- data[, predictor_vars, drop = FALSE]
pred_data <- as.data.frame(lapply(pred_data, function(x) as.numeric(x)))
# remove any non-numeric columns
numeric_cols <- sapply(pred_data, is.numeric)
if(any(!numeric_cols)) {
warning("Removing non-numeric predictor columns: ",
paste(names(pred_data)[!numeric_cols], collapse=", "))
pred_data <- pred_data[, numeric_cols, drop=FALSE]
predictor_vars <- predictor_vars[numeric_cols]
}
# handle missing values in predictor variables by using column means
for(col in names(pred_data)) {
if(any(is.na(pred_data[[col]]))) {
pred_data[[col]][is.na(pred_data[[col]])] <- mean(pred_data[[col]], na.rm=TRUE)
}
}
# standardize the predictor variables manually to avoid issues
pred_data_std <- as.data.frame(scale(pred_data))
# for each variable to impute:
for (var in vars_to_impute) {
if (var %in% names(data) && sum(is.na(data[[var]])) > 0) {
# ensure the target variable is numeric
if (!is.numeric(data[[var]])) {
cat("Skipping", var, "as it is not numeric\n")
next
}
# identify cases with missing values
missing_indices <- which(is.na(data[[var]]))
# for each missing value:
for (idx in missing_indices) {
# calculate Euclidean distances to all other countries
distances <- numeric(nrow(data))
for (i in 1:nrow(data)) {
if (i != idx) {
# calculate squared differences for each predictor
squared_diffs <- sapply(names(pred_data_std), function(p) {
(pred_data_std[idx, p] - pred_data_std[i, p])^2
})
# sum and take square root for Euclidean distance
distances[i] <- sqrt(sum(squared_diffs, na.rm=TRUE))
} else {
distances[i] <- Inf # don't use the country itself
}
}
# find k nearest neighbors with non-missing values for this variable
valid_neighbors <- which(!is.na(data[[var]]) & !is.infinite(distances))
if (length(valid_neighbors) > 0) {
# order the neighbors by distance
neighbor_order <- order(distances[valid_neighbors])
valid_neighbors <- valid_neighbors[neighbor_order]
# take the k nearest, or as many as available if fewer than k
k_actual <- min(k, length(valid_neighbors))
nearest_k <- valid_neighbors[1:k_actual]
# impute as the average of the k nearest neighbors
imputed_data[idx, var] <- mean(data[nearest_k, var], na.rm = TRUE)
cat("Imputed", var, "for", data$country_name[idx],
"using neighbors:", paste(data$country_name[nearest_k], collapse=", "), "\n")
} else {
cat("Warning: Could not impute", var, "for", data$country_name[idx],
"- no valid neighbors found\n")
}
}
}
}
return(imputed_data)
}
# now use the function to impute variables with missing values
# first ensure we have the right variable types
country_df_numeric <- country_level_df
for (var in missing_variables) {
if (var %in% names(country_level_df) && !is.numeric(country_level_df[[var]])) {
country_df_numeric[[var]] <- as.numeric(as.character(country_level_df[[var]]))
}
}
# run the imputation with proper error handling
tryCatch({
country_df_imputed <- knn_impute(
data = country_df_numeric,
vars_to_impute = missing_variables,
predictor_vars = complete_variables,
k = 3 # using 3 nearest neighbors
)
print("Imputation completed successfully!")
}, error = function(e) {
# if the knn_impute function still fails, let's try an alternative approach
print(paste("KNN imputation error:", e$message))
print("Switching to a simpler mean imputation approach...")
country_df_imputed <- country_df_numeric
for (var in missing_variables) {
if (sum(is.na(country_df_imputed[[var]])) > 0) {
# Calculate mean of non-missing values
var_mean <- mean(country_df_imputed[[var]], na.rm = TRUE)
# Replace missing values with mean
country_df_imputed[[var]][is.na(country_df_imputed[[var]])] <- var_mean
print(paste("Imputed", var, "with mean value:", var_mean))
}
}
return(country_df_imputed)
})
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Imputed Happiness_Score for Belgium using neighbors: France, Portugal, Denmark
## Imputed gender_equality_index for United Kingdom using neighbors: Netherlands, Austria, France
## Imputed z_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany
## Imputed z_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy
## Imputed z_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria
## Imputed z_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia
## Imputed norm_lgbt_support for Luxembourg using neighbors: Ireland, Austria, Germany
## Imputed norm_lgbt_support for Greece using neighbors: Spain, Slovenia, Italy
## Imputed norm_lgbt_support for Malta using neighbors: United Kingdom, Croatia, Austria
## Imputed norm_lgbt_support for Romania using neighbors: Poland, Bulgaria, Croatia
## Imputed z_happiness for Belgium using neighbors: France, Portugal, Denmark
## Imputed z_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France
## Imputed norm_gender_equality for United Kingdom using neighbors: Netherlands, Austria, France
## [1] "Imputation completed successfully!"
# save
saveRDS(country_df_imputed, file = "country_df_imputed.rds")
write.csv(country_df_imputed, "country_df_imputed.csv", row.names = FALSE)
# libraries
library(dplyr)
library(tidyr)
library(readr)
# load the data
df_rf_new <- read_csv("df_rf_new.csv") # the imputed dataset
## New names:
## Rows: 27438 Columns: 475
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): isocntry, nuts dbl (473): ...1, tnscntry, country, d11, d11r1, d11r2,
## d11r3, gen1, gen2, ge...
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
country_df_imputed <- readRDS("country_df_imputed.rds") # external dataset
# custom function to calculate country-level aggregates
calculate_country_aggregates <- function(df) {
country_agg <- df %>%
group_by(isocntry) %>%
summarize(
iso2 = first(isocntry),
# demographics
mean_age = mean(d11, na.rm = TRUE),
median_age = median(d11, na.rm = TRUE),
# life satisfaction (recoded)
mean_life_satisfaction = mean(5 - d70, na.rm = TRUE),
pct_satisfied = mean(d70 %in% c(1, 2), na.rm = TRUE) * 100,
# political discussions
mean_natl_political_discuss = mean(d71_1, na.rm = TRUE),
mean_eu_political_discuss = mean(d71_2, na.rm = TRUE),
mean_local_political_discuss = mean(d71_3, na.rm = TRUE),
pct_discuss_natl_politics = mean(d71_1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_discuss_eu_politics = mean(d71_2 %in% c(1, 2), na.rm = TRUE) * 100,
pct_discuss_local_politics = mean(d71_3 %in% c(1, 2), na.rm = TRUE) * 100,
# trade and globalization (recoded)
mean_trade_support = mean(5 - qa1, na.rm = TRUE),
pct_trade_support = mean(qa1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_pro_globalization = mean(qa5a %in% c(1, 2, 3, 5, 7, 8), na.rm = TRUE) * 100,
pct_anti_globalization = mean(qa5a %in% c(4, 6, 9, 10), na.rm = TRUE) * 100,
mean_eu_trade_support = mean(5 - qa7, na.rm = TRUE),
pct_support_eu_trade = mean(qa7 %in% c(1, 2), na.rm = TRUE) * 100,
mean_esg_support = mean(5 - qa9, na.rm = TRUE),
pct_support_esg = mean(qa9 %in% c(1, 2), na.rm = TRUE) * 100,
# political and social indicators
mean_left_right = mean(d1, na.rm = TRUE),
pct_left = mean(d1 %in% c(1, 2, 3, 4), na.rm = TRUE) * 100,
pct_center = mean(d1 %in% c(5, 6), na.rm = TRUE) * 100,
pct_right = mean(d1 %in% c(7, 8, 9, 10), na.rm = TRUE) * 100,
mean_education_years = mean(d8, na.rm = TRUE),
median_education_years = median(d8, na.rm = TRUE),
pct_high_education = mean(d8 >= 20, na.rm = TRUE) * 100,
pct_rural = mean(d25 == 1, na.rm = TRUE) * 100,
pct_urban = mean(d25 %in% c(2, 3), na.rm = TRUE) * 100,
pct_large_urban = mean(d25 == 3, na.rm = TRUE) * 100,
mean_financial_difficulty = mean(4 - d60, na.rm = TRUE),
pct_financial_difficulty = mean(d60 %in% c(1, 2), na.rm = TRUE) * 100,
mean_subjective_class = mean(d63, na.rm = TRUE),
pct_working_class = mean(d63 == 1, na.rm = TRUE) * 100,
pct_middle_class = mean(d63 %in% c(2, 3, 4), na.rm = TRUE) * 100,
pct_upper_class = mean(d63 == 5, na.rm = TRUE) * 100,
mean_voice_in_eu = mean(5 - d72_1, na.rm = TRUE),
mean_voice_in_country = mean(5 - d72_2, na.rm = TRUE),
pct_voice_in_eu = mean(d72_1 %in% c(1, 2), na.rm = TRUE) * 100,
pct_voice_in_country = mean(d72_2 %in% c(1, 2), na.rm = TRUE) * 100,
# diversity indicators
pct_friends_diff_ethnic = mean(sd1_1 == 1, na.rm = TRUE) * 100,
pct_friends_diff_skin = mean(sd1_2 == 1, na.rm = TRUE) * 100,
pct_friends_roma = mean(sd1_3 == 1, na.rm = TRUE) * 100,
pct_friends_lgbt = mean(sd1_4 == 1, na.rm = TRUE) * 100,
pct_friends_disabled = mean(sd1_5 == 1, na.rm = TRUE) * 100,
pct_friends_diff_religion = mean(sd1_6 == 1, na.rm = TRUE) * 100,
pct_friends_transgender = mean(sd1_7 == 1, na.rm = TRUE) * 100,
pct_friends_intersex = mean(sd1_8 == 1, na.rm = TRUE) * 100,
pct_ethnic_minority = mean(sd2_1 == 1, na.rm = TRUE) * 100,
pct_skin_minority = mean(sd2_2 == 1, na.rm = TRUE) * 100,
pct_religious_minority = mean(sd2_3 == 1, na.rm = TRUE) * 100,
pct_roma = mean(sd2_4 == 1, na.rm = TRUE) * 100,
pct_sexual_minority = mean(sd2_5 == 1, na.rm = TRUE) * 100,
pct_disability = mean(sd2_6 == 1, na.rm = TRUE) * 100,
pct_other_minority = mean(sd2_7 == 1, na.rm = TRUE) * 100,
pct_any_minority = mean(sd2_1 == 1 | sd2_2 == 1 | sd2_3 == 1 | sd2_4 == 1 |
sd2_5 == 1 | sd2_6 == 1 | sd2_7 == 1, na.rm = TRUE) * 100,
# religion
pct_catholic = mean(sd3 == 1, na.rm = TRUE) * 100,
pct_orthodox = mean(sd3 == 2, na.rm = TRUE) * 100,
pct_protestant = mean(sd3 == 3, na.rm = TRUE) * 100,
pct_other_christian = mean(sd3 == 4, na.rm = TRUE) * 100,
pct_jewish = mean(sd3 == 5, na.rm = TRUE) * 100,
pct_muslim = mean(sd3 %in% c(6, 7, 8), na.rm = TRUE) * 100,
pct_atheist = mean(sd3 == 13, na.rm = TRUE) * 100,
pct_nonbeliever = mean(sd3 == 14, na.rm = TRUE) * 100,
pct_nonreligious = mean(sd3 %in% c(13, 14), na.rm = TRUE) * 100,
# subjective discrimination perception (recoded)
mean_discrim_ethnic = mean(5 - qc1_1, na.rm = TRUE),
mean_discrim_skin = mean(5 - qc1_2, na.rm = TRUE),
mean_discrim_roma = mean(5 - qc1_3, na.rm = TRUE),
mean_discrim_lgbt = mean(5 - qc1_4, na.rm = TRUE),
mean_discrim_age = mean(5 - qc1_5, na.rm = TRUE),
mean_discrim_religion = mean(5 - qc1_6, na.rm = TRUE),
mean_discrim_disability = mean(5 - qc1_7, na.rm = TRUE),
mean_discrim_transgender = mean(5 - qc1_8, na.rm = TRUE),
mean_discrim_gender = mean(5 - qc1_9, na.rm = TRUE),
mean_discrim_intersex = mean(5 - qc1_10, na.rm = TRUE),
)
return(country_agg)
}
# apply the function
country_aggregates <- calculate_country_aggregates(df_rf_new)
# join
country_data_combined <- country_df_imputed %>%
left_join(country_aggregates, by = "iso2")
# while it's less relevant for a ML model such as random forest, for a multi-level
# regression model, it's important to standardize the values to allow for better
# coefficient interpretation; the goal is to have both z-scores and 0-1 normalised
# values for each of the original variables (except for variables like 'region')
country_data_combined <- country_data_combined %>%
# first handle all numeric variables that aren't already standardized
mutate(
# z-score standardization
across(where(is.numeric) &
!starts_with("z_") &
!starts_with("norm_") &
!matches("country|iso"),
list(z = ~scale(.)[,1]),
.names = "z_{.col}"),
# also create 0-1 normalized versions
across(where(is.numeric) &
!starts_with("z_") &
!starts_with("norm_") &
!matches("country|iso"),
list(norm = ~(.-min(., na.rm=TRUE))/(max(., na.rm=TRUE)-min(., na.rm=TRUE))),
.names = "norm_{.col}"))
# save the combined dataset
saveRDS(country_data_combined, file = "country_data_combined.rds")
write_csv(country_data_combined, "country_data_combined.csv")
# join with df_rf; the actual, imputed survey data
df_rf_enriched_new <- df_rf_new %>%
# ISO codes as key
left_join(country_data_combined, by = c("isocntry" = "iso2"))
# save final dataset
write_csv(df_rf_enriched_new, "df_rf_enriched_new.csv")
# load libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(viridis)
## Loading required package: viridisLite
##
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
##
## viridis_pal
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
country_level_df <- readRDS("country_level_df.rds")
# plot 1: employment and unemployment Rates
p_1 <- ggplot(country_level_df, aes(x = reorder(country_name, Employment))) +
geom_bar(aes(y = Employment), stat = "identity", fill = "steelblue", alpha = 0.7) +
geom_point(aes(y = Unemployment), color = "darkred", size = 3) +
scale_y_continuous(
name = "Employment Rate (%)",
limits = c(0, 100),
sec.axis = sec_axis(~., name = "Unemployment Rate (%)")
) +
labs(title = "Employment and Unemployment Rates by Country (2018)",
subtitle = "Bars: Employment rate | Points: Unemployment rate",
x = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title.y.left = element_text(color = "steelblue"),
axis.title.y.right = element_text(color = "darkred"))
# plot 2: GDP per capita and GDP per capita growth rate
p_2 <- ggplot(country_level_df, aes(x = reorder(country_name, gdp_2018))) +
geom_bar(aes(y = gdp_2018), stat = "identity", fill = "steelblue", alpha = 0.7) +
geom_point(aes(y = gdp_growth * 500), color = "darkred", size = 3) + # Scale growth to fit on same axis
scale_y_continuous(
name = "GDP per Capita (USD)",
labels = comma,
sec.axis = sec_axis(~./500, name = "GDP Growth 2005-2018 (%)")
) +
labs(title = "GDP per Capita (2018) and Growth Rate",
subtitle = "Bars: GDP per capita | Points: Growth rate",
x = NULL) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
panel.grid.major.x = element_blank(),
panel.grid.minor = element_blank(),
axis.title.y.left = element_text(color = "steelblue"),
axis.title.y.right = element_text(color = "darkred"))
# plot 3: relationship between employment and LGBT support
p_3 <- ggplot(country_level_df, aes(x = Employment, y = lgbt_support_percent)) +
geom_point(aes(color = Unemployment, size = rainbow_score_2019)) +
geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
scale_color_viridis_c(option = "B", name = "Unemployment\nRate (%)") +
scale_size_continuous(name = "Rainbow\nScore 2019") +
labs(title = "Relationship between Employment Rates and LGBT Support",
subtitle = "Color indicates unemployment rate, size shows level of LGBT legal protection",
x = "Employment Rate (%)",
y = "LGBT Support (%)") +
theme_minimal() +
theme(panel.grid.minor = element_blank())
# plot 4: relationship between GDP per capita to LGBT support
p_4 <- ggplot(country_level_df, aes(x = gdp_2018, y = lgbt_support_percent)) +
geom_point(aes(color = rainbow_score_2019, size = gdp_growth)) +
geom_text(aes(label = iso2), hjust = -0.3, vjust = 0, size = 3) +
scale_color_viridis_c(option = "E", name = "Rainbow\nScore 2019") +
scale_size_continuous(name = "GDP Growth\n2005-2018 (%)") +
scale_x_continuous(labels = comma) +
labs(title = "Relationship between GDP, LGBT Support and Rights Protections",
subtitle = "Size indicates GDP growth rate from 2005-2018",
x = "GDP per Capita (2018, USD)",
y = "LGBT Support (%)") +
theme_minimal() +
theme(panel.grid.minor = element_blank())
# display the plots
p_1
p_2
p_3
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).
p_4
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Removed 4 rows containing missing values or values outside the scale range
## (`geom_text()`).
library(dplyr)
library(readr)
library(kableExtra)
#df_rf_enriched_new <- readRDS("df_rf_enriched_new.rds")
df_reduced <- df_rf_enriched_new %>%
select(
# target
qc19,
# country identifier
country, country_name, isocntry,
# individual demographics
age = d11,
gender = d10,
education = d8, d8r2,
occupation = d15a,
urban_rural = d25,
financial_insecurity = d60,
social_class = d63,
religion = sd3,
political_ideology = d1,
lgb_friends = sd1_4,
trans_friends = sd1_7,
# Personal identity
ethnic_minority = sd2_1,
skin_color_minority = sd2_2,
religious_minority = sd2_3,
sexual_lgbt_minority = sd2_5,
disability_minority = sd2_6,
other_minority = sd2_7,
none_minority = sd2_8,
# Discrimination
trans_discrimination_country = qc1_8,
trans_discrimination_personal = qc2_6,
trans_discrimination_workplace = qc4_8,
trans_discrimination_political = qc6_10,
country_discrimination_efforts = qc7,
country_discrimination_efforts_recoded = qc7r,
trans_workplace_diversity = qc9_10,
trans_colleague = qc12_11,
trans_colleague_recoded = qc12_11r,
trans_child_relationship = qc13_11,
trans_child_relationship_recoded = qc13_11r,
lgb_rights = qc15_1,
same_sex_relationship = qc15_2,
same_sex_marriage = qc15_3,
lgb_school_materials = qc17_3,
trans_school_materials = qc17_4,
intersex_school_materials = qc17_5,
two_men_public_affection = qc18_2,
two_men_public_affection_recoded = qc18_2r,
two_women_public_affection = qc18_3,
two_women_public_affection_recoded = qc18_3r,
non_gendered_docs = qc20,
# Country-level variables
gdp_2018, # Economic development
rainbow_score_2019, # LGBTI rights/protections
gender_equality_index, # Gender equality
Happiness_Score, # National well-being
gdp_2018, # Economic development
rainbow_score_2019, # LGBTI rights/protections
rainbow_score_2018, # LGBTI rights/protections (previous year)
rainbow_score_avg_2019_2018, # Average LGBTI rights/protections
gender_equality_index, # Gender equality
Happiness_Score, # National well-being
v2x_libdem, # Democratic quality
v2x_egaldem, # Democratic quality
Regime_type, # Democratic quality
z_functioning_of_government, #Govenment function measure
composite_equality, # Equality measure
z_composite_equality, # Standardized equality measure
norm_composite_equality, # Normalized equality measure
z_lgbt_support, # Standardized LGBT support
norm_lgbt_support, # Normalized LGBT support
pct_friends_lgbt, # Percentage of LGBT friends
z_pct_friends_lgbt, # Standardized % of LGBT friends
norm_pct_friends_lgbt, # Normalized % of LGBT friends
mean_left_right, # Average political leaning
pct_high_education, # Education level
region, # Geographic region
# Paradata
interview_date = p1,
interview_start_time = p2,
interview_duration = p3,
interview_duration_recoded = p3r,
people_present_during_interview = p4,
respondent_cooperation = p5
)
# Save to a new file
write_rds(df_reduced, "df_reduced.rds")
# packages
library(tidyverse)
library(corrplot)
## corrplot 0.95 loaded
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggrepel)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
library(DALEX)
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
# load datasets
df_reduced <- readRDS("df_reduced.rds")
country_data_combined <- readRDS("country_data_combined.rds")
# first calculate country aggregates for individual variables so that we have
# continuous data for which we can more easily calculate correlations
df_reduced_aggregated <- df_reduced %>%
group_by(country_name, isocntry) %>%
summarize(
# target
pct_support_trans = mean(qc19 == 1, na.rm = TRUE) * 100,
pct_oppose_trans = mean(qc19 == 2, na.rm = TRUE) * 100,
pct_dk_trans = mean(qc19 == 3, na.rm = TRUE) * 100,
# demographics
mean_age = mean(age, na.rm = TRUE),
pct_female = mean(gender == 2, na.rm = TRUE) * 100,
mean_education_years = mean(education, na.rm = TRUE),
pct_high_educ = mean(d8r2 >= 3, na.rm = TRUE) * 100,
pct_rural = mean(urban_rural == 1, na.rm = TRUE) * 100,
pct_urban = mean(urban_rural == 2, na.rm = TRUE) * 100,
pct_large_urban = mean(urban_rural == 3, na.rm = TRUE) * 100,
pct_financial_difficulty = mean(financial_insecurity <= 2, na.rm = TRUE) * 100,
pct_working_class = mean(social_class == 1, na.rm = TRUE) * 100,
pct_middle_class = mean(social_class %in% c(2,3,4), na.rm = TRUE) * 100,
pct_upper_class = mean(social_class == 5, na.rm = TRUE) * 100,
# values and identity
mean_political_ideology = mean(political_ideology, na.rm = TRUE),
pct_left = mean(political_ideology <= 3, na.rm = TRUE) * 100,
pct_center = mean(political_ideology > 3 & political_ideology < 8, na.rm = TRUE) * 100,
pct_right = mean(political_ideology >= 8, na.rm = TRUE) * 100,
# religious composition
pct_religious = mean(religion %in% c(1:11), na.rm = TRUE) * 100,
pct_catholic = mean(religion == 1, na.rm = TRUE) * 100,
pct_orthodox = mean(religion == 2, na.rm = TRUE) * 100,
pct_protestant = mean(religion == 3, na.rm = TRUE) * 100,
pct_other_christian = mean(religion == 4, na.rm = TRUE) * 100,
pct_muslim = mean(religion %in% c(6:8), na.rm = TRUE) * 100,
pct_other_religion = mean(religion %in% c(9:11), na.rm = TRUE) * 100,
pct_atheist = mean(religion == 13, na.rm = TRUE) * 100,
pct_nonbeliever = mean(religion == 14, na.rm = TRUE) * 100,
pct_nonreligious = mean(religion %in% c(13,14), na.rm = TRUE) * 100,
# social networks and contact
pct_lgb_friends = mean(lgb_friends == 1, na.rm = TRUE) * 100,
pct_trans_friends = mean(trans_friends == 1, na.rm = TRUE) * 100,
# personal identity/minority status
pct_ethnic_minority = mean(ethnic_minority == 1, na.rm = TRUE) * 100,
pct_skin_color_minority = mean(skin_color_minority == 1, na.rm = TRUE) * 100,
pct_religious_minority = mean(religious_minority == 1, na.rm = TRUE) * 100,
pct_sexual_minority = mean(sexual_lgbt_minority == 1, na.rm = TRUE) * 100,
pct_disability_minority = mean(disability_minority == 1, na.rm = TRUE) * 100,
pct_other_minority = mean(other_minority == 1, na.rm = TRUE) * 100,
pct_no_minority = mean(none_minority == 1, na.rm = TRUE) * 100,
# discrimination perceptions; we need to be careful with interpretation/ coding
mean_trans_discrim_country = mean(trans_discrimination_country, na.rm = TRUE),
pct_perceive_trans_discrim = mean(trans_discrimination_country %in% c(1,2), na.rm = TRUE) * 100,
pct_experienced_trans_discrim = mean(trans_discrimination_personal == 1, na.rm = TRUE) * 100,
pct_workplace_trans_discrim = mean(trans_discrimination_workplace == 1, na.rm = TRUE) * 100,
# political representation comfort - original scale (1-10, higher = more comfortable)
mean_trans_political_comfort = mean(trans_discrimination_political, na.rm = TRUE),
pct_comfortable_trans_political = mean(trans_discrimination_political >= 7, na.rm = TRUE) * 100,
# effectiveness of discrimination efforts (original scale: 1-10, higher = more effective)
mean_country_discrimination_efforts = mean(country_discrimination_efforts, na.rm = TRUE),
# workplace diversity (1=Yes definitely, 4=No definitely not)
mean_trans_workplace_diversity = mean(trans_workplace_diversity, na.rm = TRUE),
pct_support_trans_workplace_diversity = mean(trans_workplace_diversity %in% c(1,2), na.rm = TRUE) * 100,
# LGBT attitudes - REVERSED for these scales where 1=Totally agree, 4=Totally disagree
# Create reversed versions so higher = more supportive
mean_lgb_rights_rev = mean(case_when(
lgb_rights == 1 ~ 4,
lgb_rights == 2 ~ 3,
lgb_rights == 3 ~ 2,
lgb_rights == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_same_sex_relationship_rev = mean(case_when(
same_sex_relationship == 1 ~ 4,
same_sex_relationship == 2 ~ 3,
same_sex_relationship == 3 ~ 2,
same_sex_relationship == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_same_sex_marriage_rev = mean(case_when(
same_sex_marriage == 1 ~ 4,
same_sex_marriage == 2 ~ 3,
same_sex_marriage == 3 ~ 2,
same_sex_marriage == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
# also calculate percentage who agree with LGBT rights (original values 1 or 2)
pct_support_lgb_rights = mean(lgb_rights %in% c(1,2), na.rm = TRUE) * 100,
pct_support_same_sex_relationship = mean(same_sex_relationship %in% c(1,2), na.rm = TRUE) * 100,
pct_support_same_sex_marriage = mean(same_sex_marriage %in% c(1,2), na.rm = TRUE) * 100,
# school materials (1=Totally agree, 4=Totally disagree)
# Create reversed versions so higher = more supportive
mean_lgb_school_materials_rev = mean(case_when(
lgb_school_materials == 1 ~ 4,
lgb_school_materials == 2 ~ 3,
lgb_school_materials == 3 ~ 2,
lgb_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_trans_school_materials_rev = mean(case_when(
trans_school_materials == 1 ~ 4,
trans_school_materials == 2 ~ 3,
trans_school_materials == 3 ~ 2,
trans_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
mean_intersex_school_materials_rev = mean(case_when(
intersex_school_materials == 1 ~ 4,
intersex_school_materials == 2 ~ 3,
intersex_school_materials == 3 ~ 2,
intersex_school_materials == 4 ~ 1,
TRUE ~ NA_real_), na.rm = TRUE),
# also calculate percentage who agree with inclusive school materials
pct_support_lgb_school_materials = mean(lgb_school_materials %in% c(1,2), na.rm = TRUE) * 100,
pct_support_trans_school_materials = mean(trans_school_materials %in% c(1,2), na.rm = TRUE) * 100,
pct_support_intersex_school_materials = mean(intersex_school_materials %in% c(1,2), na.rm = TRUE) * 100,
# comfort with same-sex public displays of affection (1-10)
mean_men_pda_comfort = mean(two_men_public_affection, na.rm = TRUE),
mean_women_pda_comfort = mean(two_women_public_affection, na.rm = TRUE),
pct_comfortable_men_pda = mean(two_men_public_affection >= 7, na.rm = TRUE) * 100,
pct_comfortable_women_pda = mean(two_women_public_affection >= 7, na.rm = TRUE) * 100,
# comfort with transgender colleagues/children (1-10)
mean_trans_colleague_comfort = mean(trans_colleague, na.rm = TRUE),
mean_trans_child_relationship = mean(trans_child_relationship, na.rm = TRUE),
pct_comfortable_trans_colleague = mean(trans_colleague >= 7, na.rm = TRUE) * 100,
pct_comfortable_trans_child = mean(trans_child_relationship >= 7, na.rm = TRUE) * 100,
# support for non-gendered documents
pct_support_nongendered_docs = mean(non_gendered_docs == 1, na.rm = TRUE) * 100,
pct_oppose_nongendered_docs = mean(non_gendered_docs == 2, na.rm = TRUE) * 100,
pct_dk_nongendered_docs = mean(non_gendered_docs == 3, na.rm = TRUE) * 100) %>%
ungroup()
## `summarise()` has grouped output by 'country_name'. You can override using the
## `.groups` argument.
# add country-level variables from the original dataset
# extract country-level variables that don't need to be aggregated
country_level_variables <- df_reduced %>%
select(country_name, gdp_2018, rainbow_score_2019, rainbow_score_2018,
gender_equality_index, Happiness_Score,
v2x_libdem, v2x_egaldem, Regime_type, composite_equality,
z_composite_equality, norm_composite_equality, z_lgbt_support,
norm_lgbt_support, pct_friends_lgbt, z_pct_friends_lgbt,
norm_pct_friends_lgbt, mean_left_right, pct_high_education, region) %>%
distinct()
# merge with aggregated data
df_reduced_aggregated_complete <- df_reduced_aggregated %>%
left_join(country_level_variables, by = "country_name")
# what we could add
setdiff(colnames(country_data_combined), country_level_variables)
## [1] "country_name"
## [2] "iso2"
## [3] "v2x_libdem"
## [4] "v2x_polyarchy"
## [5] "v2x_gender"
## [6] "v2x_egaldem"
## [7] "v2x_liberal"
## [8] "v2xcs_ccsi"
## [9] "v2x_freexp"
## [10] "Overall_score"
## [11] "Electoral_process_and_pluralism"
## [12] "Functioning_of_government"
## [13] "Political_participation"
## [14] "Political_culture"
## [15] "Civil_liberties"
## [16] "Regime_type"
## [17] "gdp_2005"
## [18] "gdp_2018"
## [19] "gdp_growth"
## [20] "rainbow_score_2019"
## [21] "rainbow_score_2018"
## [22] "rainbow_score_avg_2019_2018"
## [23] "rainbow_score_avg_2013_2014"
## [24] "rainbow_score_difference"
## [25] "Happiness_Score"
## [26] "Unemployment"
## [27] "Employment"
## [28] "gender_equality_index"
## [29] "z_v2x_libdem"
## [30] "z_v2x_polyarchy"
## [31] "z_v2x_gender"
## [32] "z_v2x_egaldem"
## [33] "z_v2x_liberal"
## [34] "z_v2xcs_ccsi"
## [35] "z_v2x_freexp"
## [36] "z_gdp_2018"
## [37] "log_gdp_2018"
## [38] "z_gdp_growth"
## [39] "z_unemployment"
## [40] "z_rainbow_score"
## [41] "norm_rainbow_score"
## [42] "z_lgbt_support"
## [43] "norm_lgbt_support"
## [44] "z_happiness"
## [45] "z_gender_equality"
## [46] "norm_gender_equality"
## [47] "z_functioning_of_government"
## [48] "composite_equality"
## [49] "composite_democracy"
## [50] "region"
## [51] "isocntry"
## [52] "mean_age"
## [53] "median_age"
## [54] "mean_life_satisfaction"
## [55] "pct_satisfied"
## [56] "mean_natl_political_discuss"
## [57] "mean_eu_political_discuss"
## [58] "mean_local_political_discuss"
## [59] "pct_discuss_natl_politics"
## [60] "pct_discuss_eu_politics"
## [61] "pct_discuss_local_politics"
## [62] "mean_trade_support"
## [63] "pct_trade_support"
## [64] "pct_pro_globalization"
## [65] "pct_anti_globalization"
## [66] "mean_eu_trade_support"
## [67] "pct_support_eu_trade"
## [68] "mean_esg_support"
## [69] "pct_support_esg"
## [70] "mean_left_right"
## [71] "pct_left"
## [72] "pct_center"
## [73] "pct_right"
## [74] "mean_education_years"
## [75] "median_education_years"
## [76] "pct_high_education"
## [77] "pct_rural"
## [78] "pct_urban"
## [79] "pct_large_urban"
## [80] "mean_financial_difficulty"
## [81] "pct_financial_difficulty"
## [82] "mean_subjective_class"
## [83] "pct_working_class"
## [84] "pct_middle_class"
## [85] "pct_upper_class"
## [86] "mean_voice_in_eu"
## [87] "mean_voice_in_country"
## [88] "pct_voice_in_eu"
## [89] "pct_voice_in_country"
## [90] "pct_friends_diff_ethnic"
## [91] "pct_friends_diff_skin"
## [92] "pct_friends_roma"
## [93] "pct_friends_lgbt"
## [94] "pct_friends_disabled"
## [95] "pct_friends_diff_religion"
## [96] "pct_friends_transgender"
## [97] "pct_friends_intersex"
## [98] "pct_ethnic_minority"
## [99] "pct_skin_minority"
## [100] "pct_religious_minority"
## [101] "pct_roma"
## [102] "pct_sexual_minority"
## [103] "pct_disability"
## [104] "pct_other_minority"
## [105] "pct_any_minority"
## [106] "pct_catholic"
## [107] "pct_orthodox"
## [108] "pct_protestant"
## [109] "pct_other_christian"
## [110] "pct_jewish"
## [111] "pct_muslim"
## [112] "pct_atheist"
## [113] "pct_nonbeliever"
## [114] "pct_nonreligious"
## [115] "mean_discrim_ethnic"
## [116] "mean_discrim_skin"
## [117] "mean_discrim_roma"
## [118] "mean_discrim_lgbt"
## [119] "mean_discrim_age"
## [120] "mean_discrim_religion"
## [121] "mean_discrim_disability"
## [122] "mean_discrim_transgender"
## [123] "mean_discrim_gender"
## [124] "mean_discrim_intersex"
## [125] "z_Overall_score"
## [126] "z_Electoral_process_and_pluralism"
## [127] "z_Functioning_of_government"
## [128] "z_Political_participation"
## [129] "z_Political_culture"
## [130] "z_Civil_liberties"
## [131] "z_gdp_2005"
## [132] "z_rainbow_score_2019"
## [133] "z_rainbow_score_2018"
## [134] "z_rainbow_score_avg_2019_2018"
## [135] "z_rainbow_score_avg_2013_2014"
## [136] "z_rainbow_score_difference"
## [137] "z_Happiness_Score"
## [138] "z_Unemployment"
## [139] "z_Employment"
## [140] "z_gender_equality_index"
## [141] "z_log_gdp_2018"
## [142] "z_composite_equality"
## [143] "z_composite_democracy"
## [144] "z_mean_age"
## [145] "z_median_age"
## [146] "z_mean_life_satisfaction"
## [147] "z_pct_satisfied"
## [148] "z_mean_natl_political_discuss"
## [149] "z_mean_eu_political_discuss"
## [150] "z_mean_local_political_discuss"
## [151] "z_pct_discuss_natl_politics"
## [152] "z_pct_discuss_eu_politics"
## [153] "z_pct_discuss_local_politics"
## [154] "z_mean_trade_support"
## [155] "z_pct_trade_support"
## [156] "z_pct_pro_globalization"
## [157] "z_pct_anti_globalization"
## [158] "z_mean_eu_trade_support"
## [159] "z_pct_support_eu_trade"
## [160] "z_mean_esg_support"
## [161] "z_pct_support_esg"
## [162] "z_mean_left_right"
## [163] "z_pct_left"
## [164] "z_pct_center"
## [165] "z_pct_right"
## [166] "z_mean_education_years"
## [167] "z_median_education_years"
## [168] "z_pct_high_education"
## [169] "z_pct_rural"
## [170] "z_pct_urban"
## [171] "z_pct_large_urban"
## [172] "z_mean_financial_difficulty"
## [173] "z_pct_financial_difficulty"
## [174] "z_mean_subjective_class"
## [175] "z_pct_working_class"
## [176] "z_pct_middle_class"
## [177] "z_pct_upper_class"
## [178] "z_mean_voice_in_eu"
## [179] "z_pct_voice_in_eu"
## [180] "z_pct_friends_diff_ethnic"
## [181] "z_pct_friends_diff_skin"
## [182] "z_pct_friends_roma"
## [183] "z_pct_friends_lgbt"
## [184] "z_pct_friends_disabled"
## [185] "z_pct_friends_diff_religion"
## [186] "z_pct_friends_transgender"
## [187] "z_pct_friends_intersex"
## [188] "z_pct_ethnic_minority"
## [189] "z_pct_skin_minority"
## [190] "z_pct_religious_minority"
## [191] "z_pct_roma"
## [192] "z_pct_sexual_minority"
## [193] "z_pct_disability"
## [194] "z_pct_other_minority"
## [195] "z_pct_any_minority"
## [196] "z_pct_catholic"
## [197] "z_pct_orthodox"
## [198] "z_pct_protestant"
## [199] "z_pct_other_christian"
## [200] "z_pct_jewish"
## [201] "z_pct_muslim"
## [202] "z_pct_atheist"
## [203] "z_pct_nonbeliever"
## [204] "z_pct_nonreligious"
## [205] "z_mean_discrim_ethnic"
## [206] "z_mean_discrim_skin"
## [207] "z_mean_discrim_roma"
## [208] "z_mean_discrim_lgbt"
## [209] "z_mean_discrim_age"
## [210] "z_mean_discrim_religion"
## [211] "z_mean_discrim_disability"
## [212] "z_mean_discrim_transgender"
## [213] "z_mean_discrim_gender"
## [214] "z_mean_discrim_intersex"
## [215] "norm_v2x_libdem"
## [216] "norm_v2x_polyarchy"
## [217] "norm_v2x_gender"
## [218] "norm_v2x_egaldem"
## [219] "norm_v2x_liberal"
## [220] "norm_v2xcs_ccsi"
## [221] "norm_v2x_freexp"
## [222] "norm_Overall_score"
## [223] "norm_Electoral_process_and_pluralism"
## [224] "norm_Functioning_of_government"
## [225] "norm_Political_participation"
## [226] "norm_Political_culture"
## [227] "norm_Civil_liberties"
## [228] "norm_gdp_2005"
## [229] "norm_gdp_2018"
## [230] "norm_gdp_growth"
## [231] "norm_rainbow_score_2019"
## [232] "norm_rainbow_score_2018"
## [233] "norm_rainbow_score_avg_2019_2018"
## [234] "norm_rainbow_score_avg_2013_2014"
## [235] "norm_rainbow_score_difference"
## [236] "norm_Happiness_Score"
## [237] "norm_Unemployment"
## [238] "norm_Employment"
## [239] "norm_gender_equality_index"
## [240] "norm_log_gdp_2018"
## [241] "norm_composite_equality"
## [242] "norm_composite_democracy"
## [243] "norm_mean_age"
## [244] "norm_median_age"
## [245] "norm_mean_life_satisfaction"
## [246] "norm_pct_satisfied"
## [247] "norm_mean_natl_political_discuss"
## [248] "norm_mean_eu_political_discuss"
## [249] "norm_mean_local_political_discuss"
## [250] "norm_pct_discuss_natl_politics"
## [251] "norm_pct_discuss_eu_politics"
## [252] "norm_pct_discuss_local_politics"
## [253] "norm_mean_trade_support"
## [254] "norm_pct_trade_support"
## [255] "norm_pct_pro_globalization"
## [256] "norm_pct_anti_globalization"
## [257] "norm_mean_eu_trade_support"
## [258] "norm_pct_support_eu_trade"
## [259] "norm_mean_esg_support"
## [260] "norm_pct_support_esg"
## [261] "norm_mean_left_right"
## [262] "norm_pct_left"
## [263] "norm_pct_center"
## [264] "norm_pct_right"
## [265] "norm_mean_education_years"
## [266] "norm_median_education_years"
## [267] "norm_pct_high_education"
## [268] "norm_pct_rural"
## [269] "norm_pct_urban"
## [270] "norm_pct_large_urban"
## [271] "norm_mean_financial_difficulty"
## [272] "norm_pct_financial_difficulty"
## [273] "norm_mean_subjective_class"
## [274] "norm_pct_working_class"
## [275] "norm_pct_middle_class"
## [276] "norm_pct_upper_class"
## [277] "norm_mean_voice_in_eu"
## [278] "norm_pct_voice_in_eu"
## [279] "norm_pct_friends_diff_ethnic"
## [280] "norm_pct_friends_diff_skin"
## [281] "norm_pct_friends_roma"
## [282] "norm_pct_friends_lgbt"
## [283] "norm_pct_friends_disabled"
## [284] "norm_pct_friends_diff_religion"
## [285] "norm_pct_friends_transgender"
## [286] "norm_pct_friends_intersex"
## [287] "norm_pct_ethnic_minority"
## [288] "norm_pct_skin_minority"
## [289] "norm_pct_religious_minority"
## [290] "norm_pct_roma"
## [291] "norm_pct_sexual_minority"
## [292] "norm_pct_disability"
## [293] "norm_pct_other_minority"
## [294] "norm_pct_any_minority"
## [295] "norm_pct_catholic"
## [296] "norm_pct_orthodox"
## [297] "norm_pct_protestant"
## [298] "norm_pct_other_christian"
## [299] "norm_pct_jewish"
## [300] "norm_pct_muslim"
## [301] "norm_pct_atheist"
## [302] "norm_pct_nonbeliever"
## [303] "norm_pct_nonreligious"
## [304] "norm_mean_discrim_ethnic"
## [305] "norm_mean_discrim_skin"
## [306] "norm_mean_discrim_roma"
## [307] "norm_mean_discrim_lgbt"
## [308] "norm_mean_discrim_age"
## [309] "norm_mean_discrim_religion"
## [310] "norm_mean_discrim_disability"
## [311] "norm_mean_discrim_transgender"
## [312] "norm_mean_discrim_gender"
## [313] "norm_mean_discrim_intersex"
country_data_combined_add <- country_data_combined %>%
select(iso2, norm_gdp_growth, norm_Unemployment, pct_trade_support, norm_gdp_2018,
z_gender_equality, z_rainbow_score, z_v2x_libdem, z_v2x_egaldem, z_happiness,
norm_gdp_growth, z_v2x_gender)
df_reduced_aggregated_complete <- df_reduced_aggregated_complete %>%
left_join(country_data_combined_add, by = c("isocntry" = "iso2"))
# Check the final dataset
dim(df_reduced_aggregated_complete)
## [1] 28 100
# save it
write_rds(df_reduced_aggregated_complete, "df_reduced_aggregated_complete")
### create different plots
# calculate correlation matrix for numeric variables
cor_matrix <- cor(select_if(df_reduced_aggregated_complete, is.numeric),
use = "pairwise.complete.obs")
## Warning in cor(select_if(df_reduced_aggregated_complete, is.numeric), use =
## "pairwise.complete.obs"): the standard deviation is zero
# (1) visualize complete correlation matrix
corrplot(cor_matrix, method = "color", type = "upper",
diag = F,
tl.col = "black", tl.srt = 45,
number.cex = 0.7, tl.cex = 0.7)
# (2) create custom function to analyze correlations with outcome
analyze_correlations_with_outcome <- function(data, outcome_var, min_correlation = 0.3) {
# numeric columns only
numeric_data <- data %>% select(where(is.numeric))
# calculate correlations with outcome
cors <- cor(numeric_data, numeric_data[[outcome_var]], use = "pairwise.complete.obs")
# convert to data frame
cor_df <- data.frame(
variable = rownames(cors),
correlation = cors[,1]) %>%
# remove the outcome variable correlating with itself
filter(variable != outcome_var) %>%
# sort by absolute correlation
arrange(desc(abs(correlation)))
# filter by minimum correlation threshold
cor_df <- cor_df %>% filter(abs(correlation) >= min_correlation)
return(cor_df)
}
# run correlation analysis for transgender support
trans_support_correlations <- analyze_correlations_with_outcome(
df_reduced_aggregated_complete, "pct_support_trans", min_correlation = 0.25)
## Warning in cor(numeric_data, numeric_data[[outcome_var]], use =
## "pairwise.complete.obs"): the standard deviation is zero
# display results
print(trans_support_correlations)
## variable
## pct_oppose_trans pct_oppose_trans
## pct_support_nongendered_docs pct_support_nongendered_docs
## pct_oppose_nongendered_docs pct_oppose_nongendered_docs
## pct_support_lgb_school_materials pct_support_lgb_school_materials
## z_lgbt_support z_lgbt_support
## norm_lgbt_support norm_lgbt_support
## pct_support_lgb_rights pct_support_lgb_rights
## mean_lgb_rights_rev mean_lgb_rights_rev
## pct_support_intersex_school_materials pct_support_intersex_school_materials
## pct_support_trans_school_materials pct_support_trans_school_materials
## pct_support_same_sex_marriage pct_support_same_sex_marriage
## pct_support_same_sex_relationship pct_support_same_sex_relationship
## pct_comfortable_trans_colleague pct_comfortable_trans_colleague
## mean_trans_colleague_comfort mean_trans_colleague_comfort
## mean_lgb_school_materials_rev mean_lgb_school_materials_rev
## mean_same_sex_marriage_rev mean_same_sex_marriage_rev
## mean_same_sex_relationship_rev mean_same_sex_relationship_rev
## mean_intersex_school_materials_rev mean_intersex_school_materials_rev
## mean_trans_school_materials_rev mean_trans_school_materials_rev
## mean_men_pda_comfort mean_men_pda_comfort
## norm_pct_friends_lgbt norm_pct_friends_lgbt
## pct_lgb_friends pct_lgb_friends
## pct_friends_lgbt pct_friends_lgbt
## z_pct_friends_lgbt z_pct_friends_lgbt
## pct_comfortable_men_pda pct_comfortable_men_pda
## mean_women_pda_comfort mean_women_pda_comfort
## pct_comfortable_trans_political pct_comfortable_trans_political
## pct_comfortable_women_pda pct_comfortable_women_pda
## mean_trans_political_comfort mean_trans_political_comfort
## mean_trans_child_relationship mean_trans_child_relationship
## pct_comfortable_trans_child pct_comfortable_trans_child
## z_v2x_egaldem z_v2x_egaldem
## v2x_egaldem v2x_egaldem
## composite_equality composite_equality
## z_composite_equality z_composite_equality
## norm_composite_equality norm_composite_equality
## rainbow_score_2018 rainbow_score_2018
## z_v2x_libdem z_v2x_libdem
## v2x_libdem v2x_libdem
## rainbow_score_2019 rainbow_score_2019
## z_rainbow_score z_rainbow_score
## z_gender_equality z_gender_equality
## gender_equality_index gender_equality_index
## pct_right pct_right
## pct_trans_friends pct_trans_friends
## mean_political_ideology mean_political_ideology
## mean_left_right mean_left_right
## Happiness_Score Happiness_Score
## z_happiness z_happiness
## norm_gdp_2018 norm_gdp_2018
## gdp_2018 gdp_2018
## pct_center pct_center
## norm_gdp_growth norm_gdp_growth
## mean_trans_discrim_country mean_trans_discrim_country
## pct_perceive_trans_discrim pct_perceive_trans_discrim
## pct_high_education pct_high_education
## pct_high_educ pct_high_educ
## pct_workplace_trans_discrim pct_workplace_trans_discrim
## pct_financial_difficulty pct_financial_difficulty
## pct_female pct_female
## pct_left pct_left
## mean_age mean_age
## pct_atheist pct_atheist
## pct_trade_support pct_trade_support
## z_v2x_gender z_v2x_gender
## pct_orthodox pct_orthodox
## pct_religious pct_religious
## pct_nonreligious pct_nonreligious
## pct_experienced_trans_discrim pct_experienced_trans_discrim
## pct_urban pct_urban
## pct_support_trans_workplace_diversity pct_support_trans_workplace_diversity
## pct_no_minority pct_no_minority
## pct_large_urban pct_large_urban
## pct_protestant pct_protestant
## pct_other_religion pct_other_religion
## pct_ethnic_minority pct_ethnic_minority
## mean_country_discrimination_efforts mean_country_discrimination_efforts
## correlation
## pct_oppose_trans -1.0000000
## pct_support_nongendered_docs 0.9552632
## pct_oppose_nongendered_docs -0.9552632
## pct_support_lgb_school_materials 0.9099915
## z_lgbt_support 0.9055394
## norm_lgbt_support 0.9055394
## pct_support_lgb_rights 0.9054333
## mean_lgb_rights_rev 0.8873981
## pct_support_intersex_school_materials 0.8853638
## pct_support_trans_school_materials 0.8828781
## pct_support_same_sex_marriage 0.8820719
## pct_support_same_sex_relationship 0.8799193
## pct_comfortable_trans_colleague 0.8795309
## mean_trans_colleague_comfort 0.8697944
## mean_lgb_school_materials_rev 0.8679107
## mean_same_sex_marriage_rev 0.8669420
## mean_same_sex_relationship_rev 0.8656658
## mean_intersex_school_materials_rev 0.8479568
## mean_trans_school_materials_rev 0.8459165
## mean_men_pda_comfort 0.8398369
## norm_pct_friends_lgbt 0.8350491
## pct_lgb_friends 0.8350491
## pct_friends_lgbt 0.8350491
## z_pct_friends_lgbt 0.8350491
## pct_comfortable_men_pda 0.8313883
## mean_women_pda_comfort 0.8219693
## pct_comfortable_trans_political 0.8144761
## pct_comfortable_women_pda 0.8135901
## mean_trans_political_comfort 0.8080428
## mean_trans_child_relationship 0.8008226
## pct_comfortable_trans_child 0.7938314
## z_v2x_egaldem 0.7909474
## v2x_egaldem 0.7909474
## composite_equality 0.7752634
## z_composite_equality 0.7752634
## norm_composite_equality 0.7752634
## rainbow_score_2018 0.7539898
## z_v2x_libdem 0.7147897
## v2x_libdem 0.7147897
## rainbow_score_2019 0.7094428
## z_rainbow_score 0.7094428
## z_gender_equality 0.7035475
## gender_equality_index 0.7035475
## pct_right -0.6890892
## pct_trans_friends 0.6648839
## mean_political_ideology -0.6422553
## mean_left_right -0.6422553
## Happiness_Score 0.6310032
## z_happiness 0.6310032
## norm_gdp_2018 0.5771817
## gdp_2018 0.5771817
## pct_center 0.5523491
## norm_gdp_growth -0.5522688
## mean_trans_discrim_country -0.5172765
## pct_perceive_trans_discrim 0.5008079
## pct_high_education 0.4980007
## pct_high_educ 0.4948097
## pct_workplace_trans_discrim 0.4664769
## pct_financial_difficulty -0.4504279
## pct_female -0.4496147
## pct_left 0.4253939
## mean_age 0.4017389
## pct_atheist 0.4001147
## pct_trade_support 0.3991507
## z_v2x_gender 0.3977959
## pct_orthodox -0.3976753
## pct_religious -0.3825894
## pct_nonreligious 0.3724599
## pct_experienced_trans_discrim -0.3707182
## pct_urban 0.3530711
## pct_support_trans_workplace_diversity 0.3494914
## pct_no_minority 0.3146637
## pct_large_urban -0.3090152
## pct_protestant 0.3082064
## pct_other_religion 0.2785021
## pct_ethnic_minority -0.2752098
## mean_country_discrimination_efforts 0.2604523
# create a visualization of top correlations
ggplot(trans_support_correlations %>% head(15),
aes(x = reorder(variable, correlation), y = correlation)) +
geom_bar(stat = "identity", aes(fill = correlation > 0)) +
coord_flip() +
labs(title = "Top correlated variables of transgender rights support",
subtitle = "Country-level aggregated data",
x = "Variables",
y = "Correlation with % supporting transgender document changes") +
theme_minimal() +
scale_fill_manual(values = c("firebrick", "steelblue"),
name = "Direction",
labels = c("Negative", "Positive"))
# (3) create correlation matrix for key variables
key_vars <- c("pct_support_trans",
trans_support_correlations$variable[1:25], # top 25 correlates
"gdp_2018", "rainbow_score_2019", "gender_equality_index")
key_cor_matrix <- cor(df_reduced_aggregated_complete[, key_vars],
use = "pairwise.complete.obs")
corrplot(key_cor_matrix, method = "color",
order = "hclust", diag = F,
tl.col = "black", tl.srt = 45,
number.cex = 0.6, tl.cex = 0.7)
# (4) create correlation matrix for all country variables
key_vars_country <- c("pct_support_trans", "norm_gdp_2018", "rainbow_score_2019",
"z_gender_equality", "norm_Unemployment", "norm_gdp_growth",
"z_v2x_libdem", "z_v2x_egaldem","z_happiness", "z_v2x_gender")
key_cor_matrix_country <- cor(df_reduced_aggregated_complete[, key_vars_country],
use = "pairwise.complete.obs")
corrplot(key_cor_matrix_country, method = "color",
order = "hclust", diag = F, addCoef.col = "black",
tl.col = "black", tl.srt = 45,
number.cex = 0.6, tl.cex = 0.7)
###
# regional comparisons
region_summary <- df_reduced_aggregated_complete %>%
group_by(region) %>%
summarize(
n_countries = n(),
avg_trans_support = mean(pct_support_trans),
min_support = min(pct_support_trans),
max_support = max(pct_support_trans),
std_dev = sd(pct_support_trans)) %>%
arrange(desc(avg_trans_support))
# visualize regional differences
ggplot(df_reduced_aggregated_complete, aes(x = reorder(region, pct_support_trans, FUN = mean),
y = pct_support_trans)) +
geom_boxplot(aes(fill = region)) +
geom_jitter(width = 0.2, alpha = 0.5) +
coord_flip() +
labs(title = "Support for Transgender Rights by European Region",
x = "Region",
y = "% Supporting Transgender Rights") +
theme_minimal() +
theme(legend.position = "none")
### some regression
# create a scatterplot matrix with regression lines
ggplot(df_reduced_aggregated_complete,
aes(x = pct_nonreligious, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region)) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
x = "% Non-religious Population",
y = "% Supporting Transgender Rights",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(df_reduced_aggregated_complete,
aes(x = gender_equality_index, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region)) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 10) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Gender Equality and Transgender Rights Support",
x = "Gender Equality Index",
y = "% Supporting Transgender Rights",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
### clusters
cluster_vars <- c("pct_support_trans", "norm_gdp_2018",
"rainbow_score_2019", "gender_equality_index",
"v2x_libdem", "pct_catholic", "pct_nonreligious")
# Prepare data
cluster_data <- df_reduced_aggregated_complete %>%
select(country_name, all_of(cluster_vars)) %>%
na.omit() %>%
column_to_rownames("country_name")
# Scale the data
cluster_data_scaled <- scale(cluster_data)
# Compute distance matrix
dist_matrix <- dist(cluster_data_scaled, method = "euclidean")
# Hierarchical clustering
hc <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrogram
plot(hc, main = "Clustering of Countries by Support for Transgender Rights",
sub = "", xlab = "", cex = 0.7)
rect.hclust(hc, k = 4, border = "red")
### summary table
country_summary <- df_reduced_aggregated_complete %>%
select(country_name, pct_support_trans, rainbow_score_2019,
gender_equality_index, norm_gdp_2018,
pct_nonreligious, v2x_libdem, gdp_2018) %>%
arrange(desc(pct_support_trans))
kable(country_summary,
col.names = c("Country", "Trans Rights Support (%)", "Rainbow Score",
"Gender Equality", "LGBT Comfort",
"Non-religious (%)", "Liberal Democracy", "GDP 2018"),
digits = 1) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(0, bold = TRUE) %>%
add_header_above(c(" " = 1, "Support Measures" = 2, "Equality Measures" = 2,
"Social Factors" = 2, "Economic" = 1))
| Country | Trans Rights Support (%) | Rainbow Score | Gender Equality | LGBT Comfort | Non-religious (%) | Liberal Democracy | GDP 2018 |
|---|---|---|---|---|---|---|---|
| Spain | 89.1 | 58.5 | 68.3 | 0.2 | 21.7 | 0.8 | 41073.7 |
| Netherlands | 86.8 | 49.7 | 72.9 | 0.4 | 47.6 | 0.8 | 58818.0 |
| Malta | 82.2 | 74.7 | 60.1 | 0.3 | 4.8 | 0.6 | 48127.7 |
| Denmark | 79.5 | 60.3 | 76.8 | 0.4 | 14.2 | 0.9 | 57231.3 |
| France | 79.0 | 62.7 | 72.6 | 0.2 | 22.0 | 0.8 | 46397.5 |
| Luxembourg | 79.0 | 41.3 | 69.0 | 1.0 | 20.8 | 0.8 | 116334.0 |
| Germany | 78.9 | 47.5 | 65.5 | 0.3 | 28.1 | 0.8 | 56272.9 |
| Portugal | 75.4 | 57.5 | 56.0 | 0.1 | 9.7 | 0.8 | 34725.2 |
| Ireland | 74.1 | 44.7 | 69.5 | 0.7 | 10.0 | 0.8 | 86433.7 |
| Sweden | 73.1 | 52.9 | 82.6 | 0.3 | 37.2 | 0.9 | 53122.5 |
| Belgium | 73.0 | 70.4 | 70.5 | 0.3 | 24.9 | 0.8 | 52466.5 |
| Finland | 71.9 | 62.2 | 73.0 | 0.3 | 17.0 | 0.8 | 49242.7 |
| United Kingdom | 70.8 | 67.6 | 69.6 | 0.2 | 29.4 | 0.8 | 47108.0 |
| Austria | 61.8 | 48.5 | 63.3 | 0.4 | 21.5 | 0.8 | 56654.5 |
| Greece | 59.9 | 44.8 | 50.0 | 0.1 | 1.8 | 0.8 | 29792.0 |
| Estonia | 59.1 | 34.4 | 56.7 | 0.1 | 36.4 | 0.8 | 37201.1 |
| Cyprus | 53.5 | 22.0 | 55.1 | 0.2 | 2.0 | 0.7 | 40922.7 |
| Slovenia | 52.8 | 39.8 | 68.4 | 0.2 | 6.8 | 0.7 | 38656.2 |
| Italy | 49.9 | 19.4 | 62.1 | 0.2 | 12.2 | 0.8 | 43583.4 |
| Poland | 47.2 | 18.1 | 56.8 | 0.1 | 4.6 | 0.5 | 32360.9 |
| Latvia | 46.5 | 16.8 | 57.9 | 0.1 | 20.1 | 0.7 | 29832.1 |
| Croatia | 44.8 | 45.8 | 53.1 | 0.1 | 7.3 | 0.6 | 29043.9 |
| Czech Republic | 43.8 | 25.5 | 53.6 | 0.2 | 48.2 | 0.7 | 42016.4 |
| Lithuania | 41.7 | 19.9 | 56.8 | 0.1 | 7.0 | 0.8 | 36491.9 |
| Slovakia | 28.2 | 29.0 | 52.4 | 0.1 | 8.5 | 0.7 | 31514.2 |
| Romania | 23.6 | 20.7 | 52.4 | 0.1 | 1.6 | 0.6 | 29569.6 |
| Bulgaria | 16.7 | 25.8 | 58.0 | 0.0 | 4.3 | 0.5 | 24052.2 |
| Hungary | 16.2 | 42.8 | 50.8 | 0.1 | 17.4 | 0.4 | 32258.3 |
### check some bivariate relationships; choose three key demographic variblaes
# (1) age
ggplot(df_reduced, aes(x = age, y = as.numeric(qc19 == 1))) +
geom_jitter(alpha = 0.1, height = 0.05) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Relationship Between Age and Transgender Rights Support",
x = "Age", y = "Probability of Support") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# looks like age has a non-linear effect as the curve is flat for ages until ~60 and then falls off
# this suggests we should use a quadratic term for age
# (2) political ideology
ggplot(df_reduced, aes(x = political_ideology, y = as.numeric(qc19 == 1))) +
geom_jitter(alpha = 0.1, height = 0.05) +
geom_smooth(method = "loess", se = TRUE) +
labs(title = "Relationship Between Political Ideology and Transgender Rights Support",
x = "Political Ideology (Left to Right)",
y = "Probability of Support") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# (3a) religion
ggplot(df_reduced_aggregated_complete, aes(x = pct_nonreligious, y = pct_support_trans)) +
geom_point(aes(size = rainbow_score_2019, color = region), alpha = 0.8) +
geom_text_repel(aes(label = country_name), size = 3, max.overlaps = 15) +
geom_smooth(method = "lm", se = TRUE, color = "darkgray") +
labs(title = "Relationship Between Non-religiosity and Transgender Rights Support",
subtitle = "Country-level data from Special Eurobarometer 493 (2019)",
x = "% Non-religious Population",
y = "% Supporting Transgender Document Changes",
size = "Rainbow Score",
color = "Region") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# (3b) religion disaggregated
religion_support <- df_reduced %>%
mutate(religion_group = case_when(
religion == 1 ~ "Catholic",
religion == 2 ~ "Orthodox",
religion == 3 ~ "Protestant",
religion == 4 ~ "Other Christian",
religion %in% c(6:8) ~ "Muslim",
religion %in% c(9:11) ~ "Other Religion",
religion == 13 ~ "Atheist",
religion == 14 ~ "Non-believer",
TRUE ~ "Other/Missing")) %>%
group_by(religion_group) %>%
summarize(
support_rate = mean(qc19 == 1, na.rm = TRUE) * 100,
sample_size = n(),
se = sqrt((support_rate/100 * (1-support_rate/100)) / sample_size) * 100) %>%
filter(religion_group != "Other/Missing", sample_size >= 30) %>%
arrange(desc(support_rate))
ggplot(religion_support, aes(x = reorder(religion_group, support_rate), y = support_rate)) +
geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
geom_errorbar(aes(ymin = support_rate - 1.96*se,
ymax = support_rate + 1.96*se),
width = 0.2) +
geom_text(aes(label = sprintf("%.1f%%", support_rate), y = support_rate + 3),
vjust = 0) +
geom_text(aes(label = paste0("n=", sample_size), y = -5),
vjust = 1, size = 3) +
coord_flip() +
labs(title = "Support for Transgender Rights by Religious Affiliation",
subtitle = "Percentage supporting transgender document changes by religion",
x = NULL,
y = "Support Rate (%)") +
ylim(-5, max(religion_support$support_rate) + 10) +
theme_minimal() +
theme(
plot.title = element_text(face = "bold"),
panel.grid.major.y = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_text(face = "bold"))
### intermediate conclusion:
# based on the results, we should use the following variables in the multilevel
# regression model
# country-level variables:
# z_rainbow_score, z_gender_equality, regions, z_v2x_libdem, z_v2z_egaldem,
# norm_gdp_2018
### use ML for predictor selection (country variables)
# prepare data for random forest by selecting variables we considered before
df_reduced$region <- as.factor(df_reduced$region)
df_reduced$Regime_type <- as.factor(df_reduced$Regime_type)
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, ifelse(df_reduced$qc19 == 2, 0, NA)) %>%
as.factor() # convert target to binary
# remove NAs due to the coding before
df_reduced <- df_reduced %>%
na.omit()
# join
df_reduced_rf <- df_reduced %>%
left_join(country_data_combined, by = "country_name")
model_data <- df_reduced_rf %>%
select(region.x, z_v2x_libdem, z_v2x_egaldem, z_v2x_freexp, z_v2x_gender, z_v2x_liberal,
z_gender_equality_index, z_rainbow_score_2019, norm_gdp_2018, norm_Unemployment,
norm_Happiness_Score, norm_gdp_growth, z_Functioning_of_government,
z_Electoral_process_and_pluralism, qc19)
# split data into training and testing
set.seed(69420)
train_index <- createDataPartition(model_data$qc19, p = 0.7, list = FALSE)
train_data <- model_data[train_index, ]
test_data <- model_data[-train_index, ]
# train random forest
rf_model <- ranger(
qc19 ~ ., # Exclude country as a predictor
data = train_data,
importance = "impurity", # Gini importance
num.trees = 500,
mtry = floor(sqrt(ncol(train_data) - 2)), # Default mtry for classification
probability = TRUE)
# extract variable importance
var_importance <- importance(rf_model)
var_importance_df <- data.frame(
Variable = names(var_importance),
Importance = var_importance)
var_importance_df <- var_importance_df[order(var_importance_df$Importance, decreasing = TRUE), ]
# Plot variable importance
ggplot(var_importance_df, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_minimal() +
labs(title = "Variable Importance from Random Forest",
x = "Variables",
y = "Importance")
library(tidyverse)
library(lme4) # For multilevel modeling
library(lmerTest) # For p-values in multilevel models
##
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
##
## lmer
## The following object is masked from 'package:stats':
##
## step
library(ggplot2) # For data visualization
library(dplyr) # For data manipulation
library(sjstats) # For calculating ICC
##
## Attaching package: 'sjstats'
## The following object is masked from 'package:survey':
##
## cv
library(sjPlot) # For plotting model results
## #refugeeswelcome
library(modelsummary)
library(broom.mixed) # For extracting mixed model statistics
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(rtf)
##
## Attaching package: 'rtf'
## The following object is masked from 'package:purrr':
##
## done
## The following object is masked from 'package:tibble':
##
## view
library(haven)
library(MuMIn)
##
## Attaching package: 'MuMIn'
## The following object is masked from 'package:ranger':
##
## importance
## The following object is masked from 'package:randomForest':
##
## importance
data <- read_dta("data/raw/ZA7575.dta")
df_reduced <- readRDS("df_reduced.rds")
# restore the old NAs and do CCA
df_reduced$qc19 <- data$qc19
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 3, NA, df_reduced$qc19)
df_reduced <- df_reduced %>%
drop_na(qc19)
# quick data check
table(df_reduced$country_name)
##
## Austria Belgium Bulgaria Croatia Cyprus
## 957 1014 738 912 399
## Czech Republic Denmark Estonia Finland France
## 919 930 803 883 915
## Germany Greece Hungary Ireland Italy
## 1333 923 903 882 889
## Latvia Lithuania Luxembourg Malta Netherlands
## 847 851 468 448 980
## Poland Portugal Romania Slovakia Slovenia
## 818 911 905 886 920
## Spain Sweden United Kingdom
## 921 902 901
# Calculate the percentage of "Yes" (1) responses for qc19 by country
country_means <- aggregate(qc19 == 1 ~ country_name, data = df_reduced, FUN = mean)
# Multiply by 100 to get percentage and then sort
country_means$percentage <- country_means$`qc19 == 1` * 100
country_means <- country_means[order(country_means$percentage), ]
# Create the plot
ggplot(country_means, aes(x = reorder(country_name, percentage), y = percentage)) +
geom_bar(stat = "identity", fill = "steelblue") +
theme_minimal() +
labs(title = "Support for Transgender Rights by Country",
subtitle = "Percentage answering 'Yes' to allowing transgender people to change civil documents",
x = "Country",
y = "Support (%)") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# scale some of the variables (both categorical and continuous) with larger scales
df_reduced <- df_reduced %>%
mutate(
age_z = scale(age),
religion_z = scale(as.numeric(religion)),
political_ideology_z = scale(political_ideology),
respondent_cooperation_z = scale(as.numeric(respondent_cooperation)),
rainbow_score_z = scale(rainbow_score_2019),
v2x_egaldem_z = scale(v2x_egaldem))
# Recode qc19 from 1 and 2 to 0 and 1 for the binary model
df_reduced$qc19 <- ifelse(df_reduced$qc19 == 1, 1, 0)
# Optimize the settings to avoid convergence problems in the glmer() functions
control = glmerControl(optimizer = "bobyqa",
optCtrl = list(maxfun = 100000))
# Recode some of the variables such that higher values translate to a more positive
# attitude towards LGBT rights in general -> simplieies coefficient interpretation
df_reduced <- df_reduced %>%
mutate(trans_friends = ifelse(trans_friends == 1, 2, 1), # trans_friends is sd1_7 (see Reduced dataset.R)
lgb_friends = ifelse(lgb_friends == 1, 2, 1)) # lgb_friends is sd1_4 (see Reduced dataset.R)
# STEP 1: Null Model (Intercept-only model)
null_model <- glmer(qc19 ~ 1 + (1|country_name), family = "binomial", data = df_reduced)
summary(null_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: qc19 ~ 1 + (1 | country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 28212.6 28228.8 -14104.3 28208.6 24156
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.0233 -0.9008 0.5066 0.6531 2.2755
##
## Random effects:
## Groups Name Variance Std.Dev.
## country_name (Intercept) 0.9649 0.9823
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4495 0.1856 2.422 0.0154 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Calculate ICC
performance::icc(null_model)
## # Intraclass Correlation Coefficient
##
## Adjusted ICC: 0.227
## Unadjusted ICC: 0.227
The null model indicates a statistically significant baseline probability of support for transgender individuals obtaining official documents (intercept = 0.45, p = 0.015), with substantial country-level variation accounting for 22.7% of the total variance in this support (ICC = 0.227).
# STEP 2: Add Individual-Level Predictors (Level 1)
# starting with predictors that showed correlation with qc19
model1 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1|country_name),
family = "binomial",
control = control,
data = df_reduced)
summary(model1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + (1 | country_name)
## Data: df_reduced
## Control: control
##
## AIC BIC logLik deviance df.resid
## 26411.0 26483.8 -13196.5 26393.0 24149
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.1228 -0.7612 0.3617 0.6576 4.7443
##
## Random effects:
## Groups Name Variance Std.Dev.
## country_name (Intercept) 0.6956 0.834
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.79076 0.17981 -9.959 < 2e-16 ***
## age_z -0.08163 0.01613 -5.060 4.20e-07 ***
## gender 0.38278 0.03068 12.477 < 2e-16 ***
## religion_z 0.10468 0.01767 5.924 3.14e-09 ***
## political_ideology_z -0.13256 0.01552 -8.543 < 2e-16 ***
## lgb_friends 0.88187 0.03648 24.173 < 2e-16 ***
## trans_friends 0.40647 0.06001 6.773 1.26e-11 ***
## respondent_cooperation_z -0.27127 0.01610 -16.845 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f
## age_z -0.070
## gender -0.246 -0.022
## religion_z 0.014 0.119 0.098
## pltcl_dlgy_ -0.021 0.045 0.007 0.070
## lgb_friends -0.187 0.211 -0.043 -0.087 0.032
## trans_frnds -0.297 0.036 -0.005 -0.025 0.015 -0.224
## rspndnt_cp_ -0.021 -0.078 -0.023 -0.004 -0.005 0.068 0.017
# compare with the null_model
anova(null_model, model1)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.7 7 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | |
|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
| (Intercept) | 0.449* | -1.791*** |
| (0.186) | (0.180) | |
| age_z | -0.082*** | |
| (0.016) | ||
| gender | 0.383*** | |
| (0.031) | ||
| religion_z | 0.105*** | |
| (0.018) | ||
| political_ideology_z | -0.133*** | |
| (0.016) | ||
| lgb_friends | 0.882*** | |
| (0.036) | ||
| trans_friends | 0.406*** | |
| (0.060) | ||
| respondent_cooperation_z | -0.271*** | |
| (0.016) | ||
| SD (Intercept country_name) | 0.982 | 0.834 |
| Num.Obs. | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 |
| BIC | 28228.8 | 26483.8 |
Model 1 significantly improves fit over the null model (χ² = 1720.5, p < 0.001, lower AIC/BIC). Transgender documentation is positively associated with characteristics such as being female and having LGB/transgender friends, while negatively associated with age and respondent cooperation during the survey.
# STEP 3: Add Country-Level Predictors (Level 2)
# We test three different models based on 'themes'. As we only have 28 level-2 units,
# we should not add more than 2-3 country-level variables to each model --> overfitting
# Political/ Economic development model
model_econ_pol <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
trans_friends + respondent_cooperation_z +
scale(gdp_2018) + z_functioning_of_government +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
# Social wellbeing model
model_wellbeing <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
trans_friends + respondent_cooperation_z +
scale(Happiness_Score) + scale(gender_equality_index) +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
# LGBTQ/equality values model
model_equality <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z + (1|country_name),
family = "binomial",
control = control,
data = df_reduced)
# decide which model is the best
anova(model_econ_pol, model_wellbeing, model_equality) # equality is the best
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_econ_pol: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(gdp_2018) + z_functioning_of_government + (1 + lgb_friends | country_name)
## model_wellbeing: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + scale(Happiness_Score) + scale(gender_equality_index) + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality 11 26382 26471 -13180 26360
## model_econ_pol 13 26401 26506 -13188 26375 0 2 1
## model_wellbeing 13 26404 26509 -13189 26378 0 0
# check whether the equality model can be improved
model_equality_2 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z + z_functioning_of_government +
(1|country_name),
family = "binomial",
control = control,
data = df_reduced)
anova(model_equality, model_equality_2) # no improvement --> keep model_equality
## Data: df_reduced
## Models:
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model_equality_2: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + z_functioning_of_government + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## model_equality 11 26382 26471 -13180 26360
## model_equality_2 12 26383 26480 -13180 26359 1.0893 1 0.2966
# test for multicollinearity among the two cultural country-level variables
vif_model <- lm(qc19 ~ rainbow_score_z + v2x_egaldem_z, data = df_reduced)
vif(vif_model) # no multicollinearity issues --> keep model_equality as it is
## rainbow_score_z v2x_egaldem_z
## 1.179562 1.179562
# compare with the previous two models
anova(null_model, model1, model_equality)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.669 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.612 2 8.285e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | |
|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |||
| (Intercept) | 0.449* | -1.791*** | -1.763*** |
| (0.186) | (0.180) | (0.122) | |
| age_z | -0.082*** | -0.083*** | |
| (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | |
| (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | |
| (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | |
| (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | |
| (0.036) | (0.036) | ||
| trans_friends | 0.406*** | 0.407*** | |
| (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | |
| (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | ||
| (0.091) | |||
| v2x_egaldem_z | 0.480*** | ||
| (0.095) | |||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 |
| Num.Obs. | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 |
| BIC | 28228.8 | 26483.8 | 26471.4 |
Model 2 improves fit yet again by adding country-level predictors (χ² = 33.7, p < 0.001, AIC reduced by 29.7 points). The strongest predictors are LGB friends (0.88), egalitarian democracy (0.49), transgender friends (0.42), and LGBTQ legal protections (rainbow score, 0.37), while individual-level predictors remained consistent with Model 1.
# STEP 4: Add Random Slopes for Individual-Level Predictors
# Let's add random slopes for lgb_friends which has shown a strong effect
model3 <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z +
(1 + lgb_friends|country_name),
family = "binomial",
data = df_reduced)
summary(model3)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + (1 + lgb_friends | country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26379.2 26484.4 -13176.6 26353.2 24145
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8970 -0.7605 0.3654 0.6560 4.7324
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28167 0.5307
## lgb_friends 0.02986 0.1728 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75034 0.13324 -13.136 < 2e-16 ***
## age_z -0.08369 0.01617 -5.177 2.25e-07 ***
## gender 0.38596 0.03072 12.565 < 2e-16 ***
## religion_z 0.10190 0.01769 5.761 8.36e-09 ***
## political_ideology_z -0.13125 0.01556 -8.437 < 2e-16 ***
## lgb_friends 0.87288 0.05021 17.385 < 2e-16 ***
## trans_friends 0.40446 0.06011 6.728 1.72e-11 ***
## respondent_cooperation_z -0.26967 0.01612 -16.734 < 2e-16 ***
## rainbow_score_z 0.36167 0.09015 4.012 6.02e-05 ***
## v2x_egaldem_z 0.47996 0.09428 5.091 3.57e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.332 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.046 0.007 0.070
## lgb_friends -0.466 0.158 -0.029 -0.059 0.027
## trans_frnds -0.400 0.036 -0.005 -0.025 0.015 -0.166
## rspndnt_cp_ -0.028 -0.078 -0.024 -0.003 -0.004 0.048 0.018
## ranbw_scr_z 0.030 -0.032 0.013 -0.003 0.014 -0.029 -0.008 0.001
## v2x_egldm_z 0.006 -0.012 0.012 -0.020 0.011 -0.008 0.012 0.011 -0.346
# compare with previous three models
anova(null_model, model1, model_equality, model3)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality,
"Model 4" = model3),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | Model 4 | |
|---|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||||
| (Intercept) | 0.449* | -1.791*** | -1.763*** | -1.750*** |
| (0.186) | (0.180) | (0.122) | (0.133) | |
| age_z | -0.082*** | -0.083*** | -0.084*** | |
| (0.016) | (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | 0.386*** | |
| (0.031) | (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | 0.102*** | |
| (0.018) | (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | -0.131*** | |
| (0.016) | (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | 0.873*** | |
| (0.036) | (0.036) | (0.050) | ||
| trans_friends | 0.406*** | 0.407*** | 0.404*** | |
| (0.060) | (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | -0.270*** | |
| (0.016) | (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | 0.362*** | ||
| (0.091) | (0.090) | |||
| v2x_egaldem_z | 0.480*** | 0.480*** | ||
| (0.095) | (0.094) | |||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 | 0.531 |
| SD (lgb_friends country_name) | 0.173 | |||
| Cor (Intercept~lgb_friends country_name) | -0.532 | |||
| Num.Obs. | 24158 | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 | 26379.2 |
| BIC | 28228.8 | 26483.8 | 26471.4 | 26484.4 |
Model 3 slightly improves fit by adding random slopes for LGB friends (χ² = 7.92, p = 0.019). There is modest cross-country variation in the effect of having LGB friends (SD = 0.176), with a negative correlation (-0.48) between intercepts and slopes indicating stronger effects in countries with lower baseline support.
# STEP 5: Add Cross-Level Interactions
# Assuming the random slope for lgb_friends is significant
# We'll test cross-level interactions with both country-level predictors
model4a <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z + v2x_egaldem_z +
lgb_friends:rainbow_score_z +
(1 + lgb_friends|country_name),
family = "binomial", data = df_reduced)
summary(model4a)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends |
## country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26381.2 26494.5 -13176.6 26353.2 24144
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.8950 -0.7604 0.3654 0.6560 4.7324
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28176 0.5308
## lgb_friends 0.02985 0.1728 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7502822 0.1332809 -13.132 < 2e-16 ***
## age_z -0.0836921 0.0161660 -5.177 2.25e-07 ***
## gender 0.3859664 0.0307213 12.563 < 2e-16 ***
## religion_z 0.1019082 0.0176897 5.761 8.37e-09 ***
## political_ideology_z -0.1312573 0.0155631 -8.434 < 2e-16 ***
## lgb_friends 0.8728531 0.0502206 17.380 < 2e-16 ***
## trans_friends 0.4044711 0.0601146 6.728 1.72e-11 ***
## respondent_cooperation_z -0.2696768 0.0161184 -16.731 < 2e-16 ***
## rainbow_score_z 0.3625479 0.1144465 3.168 0.00154 **
## v2x_egaldem_z 0.4799307 0.0943035 5.089 3.60e-07 ***
## lgb_friends:rainbow_score_z -0.0005866 0.0481628 -0.012 0.99028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.331 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.046 0.007 0.070
## lgb_friends -0.466 0.159 -0.030 -0.059 0.028
## trans_frnds -0.400 0.036 -0.005 -0.025 0.015 -0.167
## rspndnt_cp_ -0.029 -0.078 -0.024 -0.003 -0.003 0.049 0.017
## ranbw_scr_z 0.038 -0.026 0.020 0.006 -0.007 -0.038 -0.002 -0.012
## v2x_egldm_z 0.006 -0.012 0.012 -0.020 0.012 -0.007 0.012 0.011 -0.286
## lgb_frnd:__ -0.024 0.002 -0.016 -0.013 0.029 0.025 -0.008 0.020 -0.616
## v2x_g_
## age_z
## gender
## religion_z
## pltcl_dlgy_
## lgb_friends
## trans_frnds
## rspndnt_cp_
## ranbw_scr_z
## v2x_egldm_z
## lgb_frnd:__ 0.022
# Alternative interaction with v2x_egaldem
model4b <- glmer(qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z +
rainbow_score_z+ v2x_egaldem_z +
lgb_friends:v2x_egaldem_z +
(1 + lgb_friends|country_name),
family = "binomial",
data = df_reduced)
summary(model4b)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends +
## trans_friends + respondent_cooperation_z + rainbow_score_z +
## v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends |
## country_name)
## Data: df_reduced
##
## AIC BIC logLik deviance df.resid
## 26380.7 26494.0 -13176.3 26352.7 24144
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.9289 -0.7612 0.3665 0.6548 4.7850
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## country_name (Intercept) 0.28103 0.5301
## lgb_friends 0.02859 0.1691 -0.53
## Number of obs: 24158, groups: country_name, 28
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74930 0.13319 -13.134 < 2e-16 ***
## age_z -0.08385 0.01617 -5.187 2.14e-07 ***
## gender 0.38602 0.03072 12.566 < 2e-16 ***
## religion_z 0.10193 0.01769 5.763 8.24e-09 ***
## political_ideology_z -0.13156 0.01556 -8.453 < 2e-16 ***
## lgb_friends 0.87464 0.04979 17.568 < 2e-16 ***
## trans_friends 0.40301 0.06009 6.706 1.99e-11 ***
## respondent_cooperation_z -0.26977 0.01612 -16.738 < 2e-16 ***
## rainbow_score_z 0.36184 0.09011 4.016 5.93e-05 ***
## v2x_egaldem_z 0.53462 0.11836 4.517 6.27e-06 ***
## lgb_friends:v2x_egaldem_z -0.03826 0.05046 -0.758 0.448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) age_z gender rlgn_z pltc__ lgb_fr trns_f rspn__ rnbw__
## age_z -0.098
## gender -0.332 -0.022
## religion_z 0.018 0.120 0.098
## pltcl_dlgy_ -0.031 0.047 0.007 0.070
## lgb_friends -0.464 0.159 -0.030 -0.059 0.026
## trans_frnds -0.400 0.037 -0.005 -0.025 0.016 -0.169
## rspndnt_cp_ -0.028 -0.077 -0.024 -0.003 -0.004 0.048 0.018
## ranbw_scr_z 0.030 -0.032 0.012 -0.003 0.014 -0.029 -0.008 0.001
## v2x_egldm_z 0.019 -0.022 0.012 -0.016 -0.010 0.008 -0.010 0.004 -0.269
## lgb_frn:2__ -0.012 0.013 -0.004 -0.002 0.026 -0.046 0.033 0.009 -0.005
## v2x_g_
## age_z
## gender
## religion_z
## pltcl_dlgy_
## lgb_friends
## trans_frnds
## rspndnt_cp_
## ranbw_scr_z
## v2x_egldm_z
## lgb_frn:2__ -0.616
# compare with previous models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## model4a 14 26381 26494 -13177 26353 0.0001 1 0.99024
## model4b 14 26381 26494 -13176 26353 0.5685 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modelsummary(
list("Model 1" = null_model,
"Model 2" = model1,
"Model 3" = model_equality,
"Model 4" = model3,
"Model 5a" = model4a,
"Model 5b" = model4b),
stars = TRUE,
gof_map = c("nobs", "aic", "bic", "logLik", "r.squared"))
| Model 1 | Model 2 | Model 3 | Model 4 | Model 5a | Model 5b | |
|---|---|---|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||||||
| (Intercept) | 0.449* | -1.791*** | -1.763*** | -1.750*** | -1.750*** | -1.749*** |
| (0.186) | (0.180) | (0.122) | (0.133) | (0.133) | (0.133) | |
| age_z | -0.082*** | -0.083*** | -0.084*** | -0.084*** | -0.084*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| gender | 0.383*** | 0.384*** | 0.386*** | 0.386*** | 0.386*** | |
| (0.031) | (0.031) | (0.031) | (0.031) | (0.031) | ||
| religion_z | 0.105*** | 0.102*** | 0.102*** | 0.102*** | 0.102*** | |
| (0.018) | (0.018) | (0.018) | (0.018) | (0.018) | ||
| political_ideology_z | -0.133*** | -0.132*** | -0.131*** | -0.131*** | -0.132*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| lgb_friends | 0.882*** | 0.876*** | 0.873*** | 0.873*** | 0.875*** | |
| (0.036) | (0.036) | (0.050) | (0.050) | (0.050) | ||
| trans_friends | 0.406*** | 0.407*** | 0.404*** | 0.404*** | 0.403*** | |
| (0.060) | (0.060) | (0.060) | (0.060) | (0.060) | ||
| respondent_cooperation_z | -0.271*** | -0.269*** | -0.270*** | -0.270*** | -0.270*** | |
| (0.016) | (0.016) | (0.016) | (0.016) | (0.016) | ||
| rainbow_score_z | 0.354*** | 0.362*** | 0.363** | 0.362*** | ||
| (0.091) | (0.090) | (0.114) | (0.090) | |||
| v2x_egaldem_z | 0.480*** | 0.480*** | 0.480*** | 0.535*** | ||
| (0.095) | (0.094) | (0.094) | (0.118) | |||
| lgb_friends × rainbow_score_z | -0.001 | |||||
| (0.048) | ||||||
| lgb_friends × v2x_egaldem_z | -0.038 | |||||
| (0.050) | ||||||
| SD (Intercept country_name) | 0.982 | 0.834 | 0.458 | 0.531 | 0.531 | 0.530 |
| SD (lgb_friends country_name) | 0.173 | 0.173 | 0.169 | |||
| Cor (Intercept~lgb_friends country_name) | -0.532 | -0.532 | -0.532 | |||
| Num.Obs. | 24158 | 24158 | 24158 | 24158 | 24158 | 24158 |
| AIC | 28212.6 | 26411.0 | 26382.4 | 26379.2 | 26381.2 | 26380.7 |
| BIC | 28228.8 | 26483.8 | 26471.4 | 26484.4 | 26494.5 | 26494.0 |
Testing cross-level interactions shows that neither the interaction between LGB friends and Rainbow score (p = 0.90) nor between LGB friends and egalitarian democracy (p = 0.41) is significant. Adding these interaction terms actually worsens model fit (AIC increases).
# compare all models
anova(null_model, model1, model_equality, model3, model4a, model4b)
## Data: df_reduced
## Models:
## null_model: qc19 ~ 1 + (1 | country_name)
## model1: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + (1 | country_name)
## model_equality: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 | country_name)
## model3: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + (1 + lgb_friends | country_name)
## model4a: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:rainbow_score_z + (1 + lgb_friends | country_name)
## model4b: qc19 ~ age_z + gender + religion_z + political_ideology_z + lgb_friends + trans_friends + respondent_cooperation_z + rainbow_score_z + v2x_egaldem_z + lgb_friends:v2x_egaldem_z + (1 + lgb_friends | country_name)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## null_model 2 28213 28229 -14104 28209
## model1 9 26411 26484 -13196 26393 1815.6686 7 < 2.2e-16 ***
## model_equality 11 26382 26471 -13180 26360 32.6125 2 8.285e-08 ***
## model3 13 26379 26484 -13177 26353 7.1246 2 0.02837 *
## model4a 14 26381 26494 -13177 26353 0.0001 1 0.99024
## model4b 14 26381 26494 -13176 26353 0.5685 0
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# use Stargazer to display the output in a nice table
model_stats_extended <- function(models, model_names) {
result <- data.frame(
Model = character(),
AIC = numeric(),
BIC = numeric(),
LogLik = numeric(),
Deviance = numeric(),
ChiSq = numeric(),
Df = numeric(),
p = numeric(),
ICC = numeric(),
stringsAsFactors = FALSE
)
# Extract basic statistics
for (i in 1:length(models)) {
model <- models[[i]]
result[i, "Model"] <- model_names[i]
result[i, "AIC"] <- AIC(model)
result[i, "BIC"] <- BIC(model)
result[i, "LogLik"] <- as.numeric(logLik(model))
result[i, "Deviance"] <- deviance(model)
result[i, "ICC"] <- performance::icc(model)$ICC_conditional
}
# Extract chi-square statistics from ANOVA comparison
anova_result <- anova(models[[1]], models[[2]], models[[3]], models[[4]], models[[5]], models[[6]])
for (i in 2:length(models)) {
result[i, "ChiSq"] <- anova_result$Chisq[i]
result[i, "Df"] <- anova_result$Df[i]
result[i, "p"] <- anova_result$`Pr(>Chisq)`[i]
}
return(result)
}
# for the labels
model_names <- c("Null model", "Individual predictors", "Country-level predictors",
"Random slopes", "Interaction (Rainbow)", "Interaction (Egalitarian)")
# extract statistics
all_models_stats <- model_stats_extended(
list(null_model, model1, model_equality, model3, model4a, model4b),
model_names)
# split into two separate tables because one would be too wide
main_models_table <- stargazer(null_model, model1, model_equality, model3,
type = "text",
title = "Main Multilevel Models for Transgender Document Change Support",
column.labels = model_names[1:4],
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
intercept.bottom = FALSE,
single.row = FALSE,
model.numbers = FALSE,
notes.append = FALSE,
notes = "* p<0.05; ** p<0.01; *** p<0.001",
add.lines = list(
c("AIC", round(all_models_stats$AIC[1:4], 1)),
c("BIC", round(all_models_stats$BIC[1:4], 1)),
c("Log Likelihood", round(all_models_stats$LogLik[1:4], 1)),
c("Chi-square", c("", round(all_models_stats$ChiSq[2:4], 2))),
c("df", c("", all_models_stats$Df[2:4])),
c("p-value", c("", format.pval(all_models_stats$p[2:4], digits = 3))),
c("ICC", round(all_models_stats$ICC[1:4], 3))))
##
## Main Multilevel Models for Transgender Document Change Support
## =================================================================================================
## Dependent variable:
## ------------------------------------------------------------------------
## qc19
## Null model Individual predictors Country-level predictors Random slopes
## -------------------------------------------------------------------------------------------------
## Constant 0.449* -1.791*** -1.763*** -1.750***
## (0.186) (0.180) (0.122) (0.133)
##
## age_z -0.082*** -0.083*** -0.084***
## (0.016) (0.016) (0.016)
##
## gender 0.383*** 0.384*** 0.386***
## (0.031) (0.031) (0.031)
##
## religion_z 0.105*** 0.102*** 0.102***
## (0.018) (0.018) (0.018)
##
## political_ideology_z -0.133*** -0.132*** -0.131***
## (0.016) (0.016) (0.016)
##
## lgb_friends 0.882*** 0.876*** 0.873***
## (0.036) (0.036) (0.050)
##
## trans_friends 0.406*** 0.407*** 0.404***
## (0.060) (0.060) (0.060)
##
## respondent_cooperation_z -0.271*** -0.269*** -0.270***
## (0.016) (0.016) (0.016)
##
## rainbow_score_z 0.354*** 0.362***
## (0.091) (0.090)
##
## v2x_egaldem_z 0.480*** 0.480***
## (0.095) (0.094)
##
## -------------------------------------------------------------------------------------------------
## AIC 28212.6 26411 26382.4 26379.2
## BIC 28228.8 26483.8 26471.4 26484.4
## Log Likelihood -14104.3 -13196.5 -13180.2 -13176.6
## Chi-square 1815.67 32.61 7.12
## df 7 2 2
## p-value < 2e-16 8.28e-08 0.0284
## ICC 0.227 0.154 0.043 0.043
## Observations 24,158 24,158 24,158 24,158
## Log Likelihood -14,104.320 -13,196.490 -13,180.180 -13,176.620
## Akaike Inf. Crit. 28,212.650 26,410.980 26,382.360 26,379.240
## Bayesian Inf. Crit. 28,228.830 26,483.810 26,471.380 26,484.440
## =================================================================================================
## Note: * p<0.05; ** p<0.01; *** p<0.001
interaction_models_table <- stargazer(model3, model4a, model4b,
type = "text",
title = "Interaction Models for Transgender Document Change Support",
column.labels = model_names[4:6],
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
intercept.bottom = FALSE,
single.row = FALSE,
model.numbers = FALSE,
notes.append = FALSE,
notes = "* p<0.05; ** p<0.01; *** p<0.001",
add.lines = list(
c("AIC", round(all_models_stats$AIC[4:6], 1)),
c("BIC", round(all_models_stats$BIC[4:6], 1)),
c("Log Likelihood", round(all_models_stats$LogLik[4:6], 1)),
c("Chi-square", c("", round(all_models_stats$ChiSq[5:6], 2))),
c("df", c("", all_models_stats$Df[5:6])),
c("p-value", c("", format.pval(all_models_stats$p[5:6], digits = 3))),
c("ICC", round(all_models_stats$ICC[4:6], 3))))
##
## Interaction Models for Transgender Document Change Support
## =========================================================================================
## Dependent variable:
## -------------------------------------------------------------
## qc19
## Random slopes Interaction (Rainbow) Interaction (Egalitarian)
## -----------------------------------------------------------------------------------------
## Constant -1.750*** -1.750*** -1.749***
## (0.133) (0.133) (0.133)
##
## age_z -0.084*** -0.084*** -0.084***
## (0.016) (0.016) (0.016)
##
## gender 0.386*** 0.386*** 0.386***
## (0.031) (0.031) (0.031)
##
## religion_z 0.102*** 0.102*** 0.102***
## (0.018) (0.018) (0.018)
##
## political_ideology_z -0.131*** -0.131*** -0.132***
## (0.016) (0.016) (0.016)
##
## lgb_friends 0.873*** 0.873*** 0.875***
## (0.050) (0.050) (0.050)
##
## trans_friends 0.404*** 0.404*** 0.403***
## (0.060) (0.060) (0.060)
##
## respondent_cooperation_z -0.270*** -0.270*** -0.270***
## (0.016) (0.016) (0.016)
##
## rainbow_score_z 0.362*** 0.363** 0.362***
## (0.090) (0.114) (0.090)
##
## v2x_egaldem_z 0.480*** 0.480*** 0.535***
## (0.094) (0.094) (0.118)
##
## lgb_friends:rainbow_score_z -0.001
## (0.048)
##
## lgb_friends:v2x_egaldem_z -0.038
## (0.050)
##
## -----------------------------------------------------------------------------------------
## AIC 26379.2 26381.2 26380.7
## BIC 26484.4 26494.5 26494
## Log Likelihood -13176.6 -13176.6 -13176.3
## Chi-square 0 0.57
## df 1 0
## p-value 0.99 NA
## ICC 0.043 0.043 0.043
## Observations 24,158 24,158 24,158
## Log Likelihood -13,176.620 -13,176.620 -13,176.330
## Akaike Inf. Crit. 26,379.240 26,381.240 26,380.670
## Bayesian Inf. Crit. 26,484.440 26,494.530 26,493.960
## =========================================================================================
## Note: * p<0.05; ** p<0.01; *** p<0.001
# Get anova results for interaction models
anova_results_interactions <- anova(model3, model4a, model4b)
stargazer(model3, model4a, model4b,
type = "html",
title = "Interaction Models for Transgender Document Change Support",
column.labels = c("Random Slopes", "Rainbow Interaction", "Egalitarian Interaction"),
covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)",
"Political ideology (z-score)", "Has LGB friends",
"Has transgender friends", "Respondent cooperation (z-score)",
"Rainbow score (z-score)", "Egalitarian democracy (z-score)",
"LGB friends × Rainbow score",
"LGB friends × Egalitarian democracy"),
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
add.lines = list(
c("Chi-square", "",
round(anova_results_interactions$Chisq[2], 2),
round(anova_results_interactions$Chisq[3], 2)),
c("df", "",
anova_results_interactions$Df[2],
anova_results_interactions$Df[3]),
c("p-value", "",
format.pval(anova_results_interactions$`Pr(>Chisq)`[2], digits = 3),
format.pval(anova_results_interactions$`Pr(>Chisq)`[3], digits = 3)),
c("ICC",
round(performance::icc(model3)$ICC_conditional, 3),
round(performance::icc(model4a)$ICC_conditional, 3),
round(performance::icc(model4b)$ICC_conditional, 3))),
notes = "* p<0.05; ** p<0.01; *** p<0.001",
out = "interaction_models.html")
##
## <table style="text-align:center"><caption><strong>Interaction Models for Transgender Document Change Support</strong></caption>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td colspan="3"><em>Dependent variable:</em></td></tr>
## <tr><td></td><td colspan="3" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td colspan="3">qc19</td></tr>
## <tr><td style="text-align:left"></td><td>Random Slopes</td><td>Rainbow Interaction</td><td>Egalitarian Interaction</td></tr>
## <tr><td style="text-align:left"></td><td>(1)</td><td>(2)</td><td>(3)</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Age (z-score)</td><td>-0.084<sup>***</sup></td><td>-0.084<sup>***</sup></td><td>-0.084<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Gender</td><td>0.386<sup>***</sup></td><td>0.386<sup>***</sup></td><td>0.386<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.031)</td><td>(0.031)</td><td>(0.031)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Religion (z-score)</td><td>0.102<sup>***</sup></td><td>0.102<sup>***</sup></td><td>0.102<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.018)</td><td>(0.018)</td><td>(0.018)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Political ideology (z-score)</td><td>-0.131<sup>***</sup></td><td>-0.131<sup>***</sup></td><td>-0.132<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Has LGB friends</td><td>0.873<sup>***</sup></td><td>0.873<sup>***</sup></td><td>0.875<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.050)</td><td>(0.050)</td><td>(0.050)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Has transgender friends</td><td>0.404<sup>***</sup></td><td>0.404<sup>***</sup></td><td>0.403<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.060)</td><td>(0.060)</td><td>(0.060)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Respondent cooperation (z-score)</td><td>-0.270<sup>***</sup></td><td>-0.270<sup>***</sup></td><td>-0.270<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.016)</td><td>(0.016)</td><td>(0.016)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Rainbow score (z-score)</td><td>0.362<sup>***</sup></td><td>0.363<sup>**</sup></td><td>0.362<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.090)</td><td>(0.114)</td><td>(0.090)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Egalitarian democracy (z-score)</td><td>0.480<sup>***</sup></td><td>0.480<sup>***</sup></td><td>0.535<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.094)</td><td>(0.094)</td><td>(0.118)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">LGB friends × Rainbow score</td><td></td><td>-0.001</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td>(0.048)</td><td></td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">LGB friends × Egalitarian democracy</td><td></td><td></td><td>-0.038</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td>(0.050)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>-1.750<sup>***</sup></td><td>-1.750<sup>***</sup></td><td>-1.749<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.133)</td><td>(0.133)</td><td>(0.133)</td></tr>
## <tr><td style="text-align:left"></td><td></td><td></td><td></td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Chi-square</td><td></td><td>0</td><td>0.57</td></tr>
## <tr><td style="text-align:left">df</td><td></td><td>1</td><td>0</td></tr>
## <tr><td style="text-align:left">p-value</td><td></td><td>0.99</td><td>NA</td></tr>
## <tr><td style="text-align:left">ICC</td><td>0.043</td><td>0.043</td><td>0.043</td></tr>
## <tr><td style="text-align:left">Observations</td><td>24,158</td><td>24,158</td><td>24,158</td></tr>
## <tr><td style="text-align:left">Log Likelihood</td><td>-13,176.620</td><td>-13,176.620</td><td>-13,176.330</td></tr>
## <tr><td style="text-align:left">Akaike Inf. Crit.</td><td>26,379.240</td><td>26,381.240</td><td>26,380.670</td></tr>
## <tr><td style="text-align:left">Bayesian Inf. Crit.</td><td>26,484.440</td><td>26,494.530</td><td>26,493.960</td></tr>
## <tr><td colspan="4" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"><em>Note:</em></td><td colspan="3" style="text-align:right"><sup>*</sup>p<0.05; <sup>**</sup>p<0.01; <sup>***</sup>p<0.001</td></tr>
## <tr><td style="text-align:left"></td><td colspan="3" style="text-align:right">* p<0.05; ** p<0.01; *** p<0.001</td></tr>
## </table>
# Get anova results for all models
anova_results_all <- anova(null_model, model1, model_equality, model3, model4a, model4b)
stargazer(null_model, model1, model_equality, model3, model4a, model4b,
type = "text",
title = "Multilevel Models for Transgender Document Change Support",
column.labels = c("Null", "Individual", "Country", "Random Slopes",
"Rainbow Int.", "Egalitarian Int."),
covariate.labels = c("Age (z-score)", "Gender", "Religion (z-score)",
"Political ideology (z-score)", "Has LGB friends",
"Has transgender friends", "Respondent cooperation (z-score)",
"Rainbow score (z-score)", "Egalitarian democracy (z-score)",
"LGB friends × Rainbow score",
"LGB friends × Egalitarian democracy"),
digits = 3,
star.cutoffs = c(0.05, 0.01, 0.001),
add.lines = list(
c("Chi-square", "",
round(anova_results_all$Chisq[2], 2),
round(anova_results_all$Chisq[3], 2),
round(anova_results_all$Chisq[4], 2),
round(anova_results_all$Chisq[5], 2),
round(anova_results_all$Chisq[6], 2)),
c("df", "",
anova_results_all$Df[2],
anova_results_all$Df[3],
anova_results_all$Df[4],
anova_results_all$Df[5],
anova_results_all$Df[6]),
c("p-value", "",
format.pval(anova_results_all$`Pr(>Chisq)`[2], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[3], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[4], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[5], digits = 3),
format.pval(anova_results_all$`Pr(>Chisq)`[6], digits = 3)),
c("ICC",
round(performance::icc(null_model)$ICC_conditional, 3),
round(performance::icc(model1)$ICC_conditional, 3),
round(performance::icc(model_equality)$ICC_conditional, 3),
round(performance::icc(model3)$ICC_conditional, 3),
round(performance::icc(model4a)$ICC_conditional, 3),
round(performance::icc(model4b)$ICC_conditional, 3))),
notes = "* p<0.05; ** p<0.01; *** p<0.001",
font.size = "small", # Use smaller font to help fit the table
out = "all_models.txt")
##
## Multilevel Models for Transgender Document Change Support
## ===================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------
## qc19
## Null Individual Country Random Slopes Rainbow Int. Egalitarian Int.
## (1) (2) (3) (4) (5) (6)
## -------------------------------------------------------------------------------------------------------------------
## Age (z-score) -0.082*** -0.083*** -0.084*** -0.084*** -0.084***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Gender 0.383*** 0.384*** 0.386*** 0.386*** 0.386***
## (0.031) (0.031) (0.031) (0.031) (0.031)
##
## Religion (z-score) 0.105*** 0.102*** 0.102*** 0.102*** 0.102***
## (0.018) (0.018) (0.018) (0.018) (0.018)
##
## Political ideology (z-score) -0.133*** -0.132*** -0.131*** -0.131*** -0.132***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Has LGB friends 0.882*** 0.876*** 0.873*** 0.873*** 0.875***
## (0.036) (0.036) (0.050) (0.050) (0.050)
##
## Has transgender friends 0.406*** 0.407*** 0.404*** 0.404*** 0.403***
## (0.060) (0.060) (0.060) (0.060) (0.060)
##
## Respondent cooperation (z-score) -0.271*** -0.269*** -0.270*** -0.270*** -0.270***
## (0.016) (0.016) (0.016) (0.016) (0.016)
##
## Rainbow score (z-score) 0.354*** 0.362*** 0.363** 0.362***
## (0.091) (0.090) (0.114) (0.090)
##
## Egalitarian democracy (z-score) 0.480*** 0.480*** 0.480*** 0.535***
## (0.095) (0.094) (0.094) (0.118)
##
## LGB friends × Rainbow score -0.001
## (0.048)
##
## LGB friends × Egalitarian democracy -0.038
## (0.050)
##
## Constant 0.449* -1.791*** -1.763*** -1.750*** -1.750*** -1.749***
## (0.186) (0.180) (0.122) (0.133) (0.133) (0.133)
##
## -------------------------------------------------------------------------------------------------------------------
## Chi-square 1815.67 32.61 7.12 0 0.57
## df 7 2 2 1 0
## p-value <2e-16 8.28e-08 0.0284 0.99 NA
## ICC 0.227 0.154 0.043 0.043 0.043 0.043
## Observations 24,158 24,158 24,158 24,158 24,158 24,158
## Log Likelihood -14,104.320 -13,196.490 -13,180.180 -13,176.620 -13,176.620 -13,176.330
## Akaike Inf. Crit. 28,212.650 26,410.980 26,382.360 26,379.240 26,381.240 26,380.670
## Bayesian Inf. Crit. 28,228.830 26,483.810 26,471.380 26,484.440 26,494.530 26,493.960
## ===================================================================================================================
## Note: *p<0.05; **p<0.01; ***p<0.001
## * p<0.05; ** p<0.01; *** p<0.001
Model 3 (random slopes for LGB friends) is the best model, as it significantly improves fit over simpler models (χ² = 7.92, p = 0.019) and achieves the lowest AIC (26472.5). Subsequent interaction models fail to improve fit (p = 0.90, p = 0.41) and slightly increase AIC, confirming Model 3 as the most robust choice.
# For logistic regression, the level-1 variance is fixed at π²/3
pi_squared_by_3 <- (pi^2)/3 # approximately 3.29
# Get variance components from null model
var_null <- as.data.frame(VarCorr(null_model))
between_var_null <- var_null$vcov[1] # Between-country variance
within_var_null <- pi_squared_by_3 # Fixed residual variance for logistic models
# Get variance components from final model (model3)
var_model3 <- as.data.frame(VarCorr(model3))
between_var_model3 <- var_model3$vcov[1] # Between-country variance
within_var_model3 <- pi_squared_by_3 # Still fixed for the final model
# Calculate proportion of between-country variance explained
between_var_explained <- (between_var_null - between_var_model3) / between_var_null
# For binomial models, we can't directly calculate within-country variance explained
# We can use an approximation based on the R² measure for GLMMs
# Calculate total variance in both models
total_var_null <- between_var_null + within_var_null
total_var_model3 <- between_var_model3 + within_var_model3
# Calculate proportion of total variance explained
total_var_explained <- 1 - (total_var_model3 / total_var_null)
# Alternatively, use MuMIn package for R² calculation
library(MuMIn)
r2_model3 <- r.squaredGLMM(model3)
## Warning: the null model is only correct if all the variables it uses are identical
## to those used in fitting the original model.
# This returns two values: R²m (marginal - fixed effects only) and R²c (conditional - fixed + random effects)
cat("Proportion of between-country variance explained:", round(between_var_explained * 100, 1), "%\n")
## Proportion of between-country variance explained: 70.8 %
cat("Proportion of total variance explained:", round(total_var_explained * 100, 1), "%\n")
## Proportion of total variance explained: 16.1 %
cat("R² marginal (fixed effects only):", round(r2_model3[1], 3), "\n")
## R² marginal (fixed effects only): 0.281
cat("R² conditional (fixed + random):", round(r2_model3[2], 3), "\n")
## R² conditional (fixed + random): 0.241
library(ggplot2)
library(sjPlot)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
# 1. Plot fixed effects with error bars
p1 <- plot_model(model3, sort.est = TRUE, show.values = TRUE, value.offset = 0.3) +
theme_bw() +
labs(title = "Factors Influencing Support for Transgender Document Change",
y = "Log-Odds (95% CI)")
# 2. Create a plot showing country variation
# Extract random effects for each country
re <- ranef(model3)$country_name
re_df <- data.frame(
country = rownames(re),
intercept = re$`(Intercept)`,
lgb_slope = re$lgb_friends
)
# Sort by intercept
re_df <- re_df[order(re_df$intercept), ]
re_df$country <- factor(re_df$country, levels = re_df$country)
# Create the plot
p2 <- ggplot(re_df, aes(x = country, y = intercept)) +
geom_point() +
geom_errorbar(aes(ymin = intercept - 1.96*attr(re, "postVar")[1,1,]^0.5,
ymax = intercept + 1.96*attr(re, "postVar")[1,1,]^0.5),
width = 0.2) +
coord_flip() +
theme_bw() +
labs(title = "Country Differences in Support for Transgender Rights",
subtitle = "Random intercepts with 95% confidence intervals",
x = "",
y = "Random Intercept")
# 3. Create effect plots for key predictors
# For Rainbow Score
p3 <- plot_model(model3, type = "pred", terms = "rainbow_score_z [-2:2]",
title = "Effect of Rainbow Score on Support Level",
axis.title = c("Rainbow Score (z-score)", "Probability of Support")) +
theme_bw()
## You are calculating adjusted predictions on the population-level (i.e.
## `type = "fixed"`) for a *generalized* linear mixed model.
## This may produce biased estimates due to Jensen's inequality. Consider
## setting `bias_correction = TRUE` to correct for this bias.
## See also the documentation of the `bias_correction` argument.
# For Egalitarian Democracy
p4 <- plot_model(model3, type = "pred", terms = "v2x_egaldem_z [-2:2]",
title = "Effect of Egalitarian Democracy on Support Level",
axis.title = c("Egalitarian Democracy (z-score)", "Probability of Support")) +
theme_bw()
## Threshold plot
# (1) for religion
# simple approach to calculate thresholds
thresholds <- data.frame(
country = unique(df_reduced$country_name),
threshold = NA)
# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name
# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(thresholds)) {
country_i <- thresholds$country[i]
# Get country-specific intercept
intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
# Get religiosity coefficient
relig_coef <- fixed_effects["religion_z"]
# Calculate other effects (at reference/mean values)
other_effects <- fixed_effects["lgb_friends"] * 2 # Has LGB friends
if ("trans_friends" %in% names(fixed_effects))
other_effects <- other_effects + fixed_effects["trans_friends"] * 1.5
# Calculate religiosity threshold where log-odds = 0 (probability = 0.5)
# Solve: intercept + relig_coef * threshold + other_effects = 0
thresholds$threshold[i] <- -(intercept + other_effects) / relig_coef
}
# lot the thresholds
p5 <- ggplot(thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
theme_minimal() +
labs(title = "Religiosity Threshold for Supporting Transgender Rights by Country",
subtitle = "Religiosity z-score at which support probability crosses 50%",
x = "",
y = "Religiosity Threshold (z-score)")
# (2) for political ideology
# Calculate political ideology thresholds
pol_thresholds <- data.frame(
country = unique(df_reduced$country_name),
threshold = NA
)
# Extract fixed effects
fixed_effects <- fixef(model3)
random_effects <- ranef(model3)$country_name
# Calculate threshold for each country based on model coefficients
for (i in 1:nrow(pol_thresholds)) {
country_i <- pol_thresholds$country[i]
# Get country-specific intercept
intercept <- fixed_effects["(Intercept)"] + random_effects[country_i, "(Intercept)"]
# Get political ideology coefficient
pol_coef <- fixed_effects["political_ideology_z"]
# Calculate other effects (at reference/mean values)
other_effects <- fixed_effects["lgb_friends"] * 2 + # Has LGB friends
fixed_effects["trans_friends"] * 1.5 +
fixed_effects["religion_z"] * 0 # Set religion to mean
# Calculate ideology threshold where log-odds = 0 (probability = 0.5)
pol_thresholds$threshold[i] <- -(intercept + other_effects) / pol_coef
}
# Plot the thresholds
p6 <- ggplot(pol_thresholds, aes(x = reorder(country, -threshold), y = threshold)) +
geom_bar(stat = "identity", fill = "darkred") +
coord_flip() +
theme_minimal() +
labs(title = "Political Ideology Threshold for Supporting Transgender Rights by Country",
subtitle = "Political ideology z-score at which support probability crosses 50%",
x = "",
y = "Political Ideology Threshold (z-score)")
# Display plots
p1
p2
p3
p4
p5
p6